mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-30 07:34:12 +08:00
Further restructure ext4 documentation; fix up ext4's delayed
allocation for bigalloc file systems; fix up some syzbot-detected races in EXT4_IOC_MOVE_EXT, EXT4_IOC_SWAP_BOOT, and ext4_remount; and a few other miscellaneous bugs and optimizations. -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAlvQYEcACgkQ8vlZVpUN gaOPYAgAh0BF7mTRnHAp/qkR5ZhDi3ecb3TpNlnpfzoDqQhPYETFisc18DD4HwTj wctwzSdYxYodeuPIK+R2bBzUy3FuSwtlER9cdr1ilcrUYPZHbir1rPPfTNb/oDGx WNcd/aulLjuU1eKDODowqMOF2HDchiJHqJqMBa+LfCHck1x/bt2uqdjNA5A1p5AV lp07DoXT54q5rWJDaXpbxTShWKhzHlRKbB9PKEvMHgPNl9sn5oRReRMKAW+WkT91 e3mfy/GhzhugdWxYUg2oAn3dbqYkkAjW96WnBhCQHioW9ASphjl7yBi1LWh2aPA4 haGxe5W3En8q678ZVtTVNJOyvbW81Q== =VgdS -----END PGP SIGNATURE----- Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 updates from Ted Ts'o: - further restructure ext4 documentation - fix up ext4's delayed allocation for bigalloc file systems - fix up some syzbot-detected races in EXT4_IOC_MOVE_EXT, EXT4_IOC_SWAP_BOOT, and ext4_remount - ... and a few other miscellaneous bugs and optimizations. 
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits) ext4: fix use-after-free race in ext4_remount()'s error path ext4: cache NULL when both default_acl and acl are NULL docs: promote the ext4 data structures book to top level docs: move ext4 administrative docs to admin-guide/ jbd2: fix use after free in jbd2_log_do_checkpoint() ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR ext4: fix setattr project check in fssetxattr ioctl docs: make ext4 readme tables readable docs: fix ext4 documentation table formatting problems docs: generate a separate ext4 pdf file from the documentation ext4: convert fault handler to use vm_fault_t type ext4: initialize retries variable in ext4_da_write_inline_data_begin() ext4: fix EXT4_IOC_SWAP_BOOT ext4: fix build error when DX_DEBUG is defined ext4: fix argument checking in EXT4_IOC_MOVE_EXT ext4: fix reserved cluster accounting at page invalidation time ext4: adjust reserved cluster count when removing extents ext4: reduce reserved cluster count by number of allocated clusters ext4: fix reserved cluster accounting at delayed write time ext4: add new pending reservation mechanism ...
This commit is contained in:
commit
5993692f09
574
Documentation/admin-guide/ext4.rst
Normal file
574
Documentation/admin-guide/ext4.rst
Normal file
@ -0,0 +1,574 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
========================
|
||||||
|
ext4 General Information
|
||||||
|
========================
|
||||||
|
|
||||||
|
Ext4 is an advanced level of the ext3 filesystem which incorporates
|
||||||
|
scalability and reliability enhancements for supporting large filesystems
|
||||||
|
(64 bit) in keeping with increasing disk capacities and state-of-the-art
|
||||||
|
feature requirements.
|
||||||
|
|
||||||
|
Mailing list: linux-ext4@vger.kernel.org
|
||||||
|
Web site: http://ext4.wiki.kernel.org
|
||||||
|
|
||||||
|
|
||||||
|
Quick usage instructions
|
||||||
|
========================
|
||||||
|
|
||||||
|
Note: More extensive information for getting started with ext4 can be
|
||||||
|
found at the ext4 wiki site at the URL:
|
||||||
|
http://ext4.wiki.kernel.org/index.php/Ext4_Howto
|
||||||
|
|
||||||
|
- The latest version of e2fsprogs can be found at:
|
||||||
|
|
||||||
|
https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
http://sourceforge.net/project/showfiles.php?group_id=2406
|
||||||
|
|
||||||
|
or grab the latest git repository from:
|
||||||
|
|
||||||
|
https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
|
||||||
|
|
||||||
|
- Create a new filesystem using the ext4 filesystem type:
|
||||||
|
|
||||||
|
# mke2fs -t ext4 /dev/hda1
|
||||||
|
|
||||||
|
Or to configure an existing ext3 filesystem to support extents:
|
||||||
|
|
||||||
|
# tune2fs -O extents /dev/hda1
|
||||||
|
|
||||||
|
If the filesystem was created with 128 byte inodes, it can be
|
||||||
|
converted to use 256 byte for greater efficiency via:
|
||||||
|
|
||||||
|
# tune2fs -I 256 /dev/hda1
|
||||||
|
|
||||||
|
- Mounting:
|
||||||
|
|
||||||
|
# mount -t ext4 /dev/hda1 /wherever
|
||||||
|
|
||||||
|
- When comparing performance with other filesystems, it's always
|
||||||
|
important to try multiple workloads; very often a subtle change in a
|
||||||
|
workload parameter can completely change the ranking of which
|
||||||
|
filesystems do well compared to others. When comparing versus ext3,
|
||||||
|
note that ext4 enables write barriers by default, while ext3 does
|
||||||
|
not enable write barriers by default. So it is useful to
|
||||||
|
explicitly specify whether barriers are enabled or not via the
|
||||||
|
'-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems
|
||||||
|
for a fair comparison. When tuning ext3 for best benchmark numbers,
|
||||||
|
it is often worthwhile to try changing the data journaling mode; '-o
|
||||||
|
data=writeback' can be faster for some workloads. (Note however that
|
||||||
|
running mounted with data=writeback can potentially leave stale data
|
||||||
|
exposed in recently written files in case of an unclean shutdown,
|
||||||
|
which could be a security exposure in some situations.) Configuring
|
||||||
|
the filesystem with a large journal can also be helpful for
|
||||||
|
metadata-intensive workloads.
|
||||||
|
|
||||||
|
Features
|
||||||
|
========
|
||||||
|
|
||||||
|
Currently Available
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* ability to use filesystems > 16TB (e2fsprogs support not available yet)
|
||||||
|
* extent format reduces metadata overhead (RAM, IO for access, transactions)
|
||||||
|
* extent format more robust in face of on-disk corruption due to magics,
|
||||||
|
internal redundancy in tree
|
||||||
|
* improved file allocation (multi-block alloc)
|
||||||
|
* lift 32000 subdirectory limit imposed by i_links_count[1]
|
||||||
|
* nsec timestamps for mtime, atime, ctime, create time
|
||||||
|
* inode version field on disk (NFSv4, Lustre)
|
||||||
|
* reduced e2fsck time via uninit_bg feature
|
||||||
|
* journal checksumming for robustness, performance
|
||||||
|
* persistent file preallocation (e.g. for streaming media, databases)
|
||||||
|
* ability to pack bitmaps and inode tables into larger virtual groups via the
|
||||||
|
flex_bg feature
|
||||||
|
* large file support
|
||||||
|
* inode allocation using large virtual block groups via flex_bg
|
||||||
|
* delayed allocation
|
||||||
|
* large block (up to pagesize) support
|
||||||
|
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||||
|
the ordering)
|
||||||
|
|
||||||
|
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||||
|
directory hash tree having a maximum depth of two.
|
||||||
|
|
||||||
|
Options
|
||||||
|
=======
|
||||||
|
|
||||||
|
When mounting an ext4 filesystem, the following options are accepted:
|
||||||
|
(*) == default
|
||||||
|
|
||||||
|
ro
|
||||||
|
Mount filesystem read only. Note that ext4 will replay the journal (and
|
||||||
|
thus write to the partition) even when mounted "read only". The mount
|
||||||
|
options "ro,noload" can be used to prevent writes to the filesystem.
|
||||||
|
|
||||||
|
journal_checksum
|
||||||
|
Enable checksumming of the journal transactions. This will allow the
|
||||||
|
recovery code in e2fsck and the kernel to detect corruption in the
|
||||||
|
journal. It is a compatible change and will be ignored by older
|
||||||
|
kernels.
|
||||||
|
|
||||||
|
journal_async_commit
|
||||||
|
Commit block can be written to disk without waiting for descriptor
|
||||||
|
blocks. If enabled older kernels cannot mount the device. This will
|
||||||
|
enable 'journal_checksum' internally.
|
||||||
|
|
||||||
|
journal_path=path, journal_dev=devnum
|
||||||
|
When the external journal device's major/minor numbers have changed,
|
||||||
|
these options allow the user to specify the new journal location. The
|
||||||
|
journal device is identified through either its new major/minor numbers
|
||||||
|
encoded in devnum, or via a path to the device.
|
||||||
|
|
||||||
|
norecovery, noload
|
||||||
|
Don't load the journal on mounting. Note that if the filesystem was
|
||||||
|
not unmounted cleanly, skipping the journal replay will lead to the
|
||||||
|
filesystem containing inconsistencies that can lead to any number of
|
||||||
|
problems.
|
||||||
|
|
||||||
|
data=journal
|
||||||
|
All data are committed into the journal prior to being written into the
|
||||||
|
main file system. Enabling this mode will disable delayed allocation
|
||||||
|
and O_DIRECT support.
|
||||||
|
|
||||||
|
data=ordered (*)
|
||||||
|
All data are forced directly out to the main file system prior to its
|
||||||
|
metadata being committed to the journal.
|
||||||
|
|
||||||
|
data=writeback
|
||||||
|
Data ordering is not preserved, data may be written into the main file
|
||||||
|
system after its metadata has been committed to the journal.
|
||||||
|
|
||||||
|
commit=nrsec (*)
|
||||||
|
Ext4 can be told to sync all its data and metadata every 'nrsec'
|
||||||
|
seconds. The default value is 5 seconds. This means that if you lose
|
||||||
|
your power, you will lose as much as the latest 5 seconds of work (your
|
||||||
|
filesystem will not be damaged though, thanks to the journaling). This
|
||||||
|
default value (or any low value) will hurt performance, but it's good
|
||||||
|
for data-safety. Setting it to 0 will have the same effect as leaving
|
||||||
|
it at the default (5 seconds). Setting it to very large values will
|
||||||
|
improve performance.
|
||||||
|
|
||||||
|
barrier=<0|1(*)>, barrier(*), nobarrier
|
||||||
|
This enables/disables the use of write barriers in the jbd code.
|
||||||
|
barrier=0 disables, barrier=1 enables. This also requires an IO stack
|
||||||
|
which can support barriers, and if jbd gets an error on a barrier
|
||||||
|
write, it will disable again with a warning. Write barriers enforce
|
||||||
|
proper on-disk ordering of journal commits, making volatile disk write
|
||||||
|
caches safe to use, at some performance penalty. If your disks are
|
||||||
|
battery-backed in one way or another, disabling barriers may safely
|
||||||
|
improve performance. The mount options "barrier" and "nobarrier" can
|
||||||
|
also be used to enable or disable barriers, for consistency with other
|
||||||
|
ext4 mount options.
|
||||||
|
|
||||||
|
inode_readahead_blks=n
|
||||||
|
This tuning parameter controls the maximum number of inode table blocks
|
||||||
|
that ext4's inode table readahead algorithm will pre-read into the
|
||||||
|
buffer cache. The default value is 32 blocks.
|
||||||
|
|
||||||
|
nouser_xattr
|
||||||
|
Disables Extended User Attributes. See the attr(5) manual page for
|
||||||
|
more information about extended attributes.
|
||||||
|
|
||||||
|
noacl
|
||||||
|
This option disables POSIX Access Control List support. If ACL support
|
||||||
|
is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL
|
||||||
|
is enabled by default on mount. See the acl(5) manual page for more
|
||||||
|
information about acl.
|
||||||
|
|
||||||
|
bsddf (*)
|
||||||
|
Make 'df' act like BSD.
|
||||||
|
|
||||||
|
minixdf
|
||||||
|
Make 'df' act like Minix.
|
||||||
|
|
||||||
|
debug
|
||||||
|
Extra debugging information is sent to syslog.
|
||||||
|
|
||||||
|
abort
|
||||||
|
Simulate the effects of calling ext4_abort() for debugging purposes.
|
||||||
|
This is normally used while remounting a filesystem which is already
|
||||||
|
mounted.
|
||||||
|
|
||||||
|
errors=remount-ro
|
||||||
|
Remount the filesystem read-only on an error.
|
||||||
|
|
||||||
|
errors=continue
|
||||||
|
Keep going on a filesystem error.
|
||||||
|
|
||||||
|
errors=panic
|
||||||
|
Panic and halt the machine if an error occurs. (These mount options
|
||||||
|
override the errors behavior specified in the superblock, which can be
|
||||||
|
configured using tune2fs)
|
||||||
|
|
||||||
|
data_err=ignore(*)
|
||||||
|
Just print an error message if an error occurs in a file data buffer in
|
||||||
|
ordered mode.
|
||||||
|
data_err=abort
|
||||||
|
Abort the journal if an error occurs in a file data buffer in ordered
|
||||||
|
mode.
|
||||||
|
|
||||||
|
grpid | bsdgroups
|
||||||
|
New objects have the group ID of their parent.
|
||||||
|
|
||||||
|
nogrpid (*) | sysvgroups
|
||||||
|
New objects have the group ID of their creator.
|
||||||
|
|
||||||
|
resgid=n
|
||||||
|
The group ID which may use the reserved blocks.
|
||||||
|
|
||||||
|
resuid=n
|
||||||
|
The user ID which may use the reserved blocks.
|
||||||
|
|
||||||
|
sb=
|
||||||
|
Use alternate superblock at this location.
|
||||||
|
|
||||||
|
quota, noquota, grpquota, usrquota
|
||||||
|
These options are ignored by the filesystem. They are used only by
|
||||||
|
quota tools to recognize volumes where quota should be turned on. See
|
||||||
|
documentation in the quota-tools package for more details
|
||||||
|
(http://sourceforge.net/projects/linuxquota).
|
||||||
|
|
||||||
|
jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file>
|
||||||
|
These options tell filesystem details about quota so that quota
|
||||||
|
information can be properly updated during journal replay. They replace
|
||||||
|
the above quota options. See documentation in the quota-tools package
|
||||||
|
for more details (http://sourceforge.net/projects/linuxquota).
|
||||||
|
|
||||||
|
stripe=n
|
||||||
|
Number of filesystem blocks that mballoc will try to use for allocation
|
||||||
|
size and alignment. For RAID5/6 systems this should be the number of
|
||||||
|
data disks * RAID chunk size in file system blocks.
|
||||||
|
|
||||||
|
delalloc (*)
|
||||||
|
Defer block allocation until just before ext4 writes out the block(s)
|
||||||
|
in question. This allows ext4 to make better allocation decisions more
|
||||||
|
efficiently.
|
||||||
|
|
||||||
|
nodelalloc
|
||||||
|
Disable delayed allocation. Blocks are allocated when the data is
|
||||||
|
copied from userspace to the page cache, either via the write(2) system
|
||||||
|
call or when an mmap'ed page which was previously unallocated is
|
||||||
|
written for the first time.
|
||||||
|
|
||||||
|
max_batch_time=usec
|
||||||
|
Maximum amount of time ext4 should wait for additional filesystem
|
||||||
|
operations to be batched together with a synchronous write operation.
|
||||||
|
Since a synchronous write operation is going to force a commit and then
|
||||||
|
a wait for the I/O to complete, it doesn't cost much, and can be a huge
|
||||||
|
throughput win, we wait for a small amount of time to see if any other
|
||||||
|
transactions can piggyback on the synchronous write. The algorithm
|
||||||
|
used is designed to automatically tune for the speed of the disk, by
|
||||||
|
measuring the amount of time (on average) that it takes to finish
|
||||||
|
committing a transaction. Call this time the "commit time". If the
|
||||||
|
time that the transaction has been running is less than the commit
|
||||||
|
time, ext4 will try sleeping for the commit time to see if other
|
||||||
|
operations will join the transaction. The commit time is capped by
|
||||||
|
the max_batch_time, which defaults to 15000us (15ms). This
|
||||||
|
optimization can be turned off entirely by setting max_batch_time to 0.
|
||||||
|
|
||||||
|
min_batch_time=usec
|
||||||
|
This parameter sets the commit time (as described above) to be at least
|
||||||
|
min_batch_time. It defaults to zero microseconds. Increasing this
|
||||||
|
parameter may improve the throughput of multi-threaded, synchronous
|
||||||
|
workloads on very fast disks, at the cost of increasing latency.
|
||||||
|
|
||||||
|
journal_ioprio=prio
|
||||||
|
The I/O priority (from 0 to 7, where 0 is the highest priority) which
|
||||||
|
should be used for I/O operations submitted by kjournald2 during a
|
||||||
|
commit operation. This defaults to 3, which is a slightly higher
|
||||||
|
priority than the default I/O priority.
|
||||||
|
|
||||||
|
auto_da_alloc(*), noauto_da_alloc
|
||||||
|
Many broken applications don't use fsync() when replacing existing
|
||||||
|
files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/
|
||||||
|
rename("foo.new", "foo"), or worse yet, fd = open("foo",
|
||||||
|
O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4
|
||||||
|
will detect the replace-via-rename and replace-via-truncate patterns
|
||||||
|
and force that any delayed allocation blocks are allocated such that at
|
||||||
|
the next journal commit, in the default data=ordered mode, the data
|
||||||
|
blocks of the new file are forced to disk before the rename() operation
|
||||||
|
is committed. This provides roughly the same level of guarantees as
|
||||||
|
ext3, and avoids the "zero-length" problem that can happen when a
|
||||||
|
system crashes before the delayed allocation blocks are forced to disk.
|
||||||
|
|
||||||
|
noinit_itable
|
||||||
|
Do not initialize any uninitialized inode table blocks in the
|
||||||
|
background. This feature may be used by installation CD's so that the
|
||||||
|
install process can complete as quickly as possible; the inode table
|
||||||
|
initialization process would then be deferred until the next time the
|
||||||
|
file system is unmounted.
|
||||||
|
|
||||||
|
init_itable=n
|
||||||
|
The lazy itable init code will wait n times the number of milliseconds
|
||||||
|
it took to zero out the previous block group's inode table. This
|
||||||
|
minimizes the impact on the system performance while file system's
|
||||||
|
inode table is being initialized.
|
||||||
|
|
||||||
|
discard, nodiscard(*)
|
||||||
|
Controls whether ext4 should issue discard/TRIM commands to the
|
||||||
|
underlying block device when blocks are freed. This is useful for SSD
|
||||||
|
devices and sparse/thinly-provisioned LUNs, but it is off by default
|
||||||
|
until sufficient testing has been done.
|
||||||
|
|
||||||
|
nouid32
|
||||||
|
Disables 32-bit UIDs and GIDs. This is for interoperability with
|
||||||
|
older kernels which only store and expect 16-bit values.
|
||||||
|
|
||||||
|
block_validity(*), noblock_validity
|
||||||
|
These options enable or disable the in-kernel facility for tracking
|
||||||
|
filesystem metadata blocks within internal data structures. This
|
||||||
|
allows the multi-block allocator and other routines to notice bugs or
|
||||||
|
corrupted allocation bitmaps which cause blocks to be allocated which
|
||||||
|
overlap with filesystem metadata blocks.
|
||||||
|
|
||||||
|
dioread_lock, dioread_nolock
|
||||||
|
Controls whether or not ext4 should use the DIO read locking. If the
|
||||||
|
dioread_nolock option is specified ext4 will allocate uninitialized
|
||||||
|
extent before buffer write and convert the extent to initialized after
|
||||||
|
IO completes. This approach allows ext4 code to avoid using inode
|
||||||
|
mutex, which improves scalability on high speed storages. However this
|
||||||
|
does not work with data journaling and dioread_nolock option will be
|
||||||
|
ignored with kernel warning. Note that dioread_nolock code path is only
|
||||||
|
used for extent-based files. Because of the restrictions this option
|
||||||
|
comprises, it is off by default (i.e. dioread_lock).
|
||||||
|
|
||||||
|
max_dir_size_kb=n
|
||||||
|
This limits the size of directories so that any attempt to expand them
|
||||||
|
beyond the specified limit in kilobytes will cause an ENOSPC error.
|
||||||
|
This is useful in memory constrained environments, where a very large
|
||||||
|
directory can cause severe performance problems or even provoke the Out
|
||||||
|
Of Memory killer. (For example, if there is only 512mb memory
|
||||||
|
available, a 176mb directory may seriously cramp the system's style.)
|
||||||
|
|
||||||
|
i_version
|
||||||
|
Enable 64-bit inode version support. This option is off by default.
|
||||||
|
|
||||||
|
dax
|
||||||
|
Use direct access (no page cache). See
|
||||||
|
Documentation/filesystems/dax.txt. Note that this option is
|
||||||
|
incompatible with data=journal.
|
||||||
|
|
||||||
|
Data Mode
|
||||||
|
=========
|
||||||
|
There are 3 different data modes:
|
||||||
|
|
||||||
|
* writeback mode
|
||||||
|
|
||||||
|
In data=writeback mode, ext4 does not journal data at all. This mode provides
|
||||||
|
a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
|
||||||
|
mode - metadata journaling. A crash+recovery can cause incorrect data to
|
||||||
|
appear in files which were written shortly before the crash. This mode will
|
||||||
|
typically provide the best ext4 performance.
|
||||||
|
|
||||||
|
* ordered mode
|
||||||
|
|
||||||
|
In data=ordered mode, ext4 only officially journals metadata, but it logically
|
||||||
|
groups metadata information related to data changes with the data blocks into
|
||||||
|
a single unit called a transaction. When it's time to write the new metadata
|
||||||
|
out to disk, the associated data blocks are written first. In general, this
|
||||||
|
mode performs slightly slower than writeback but significantly faster than
|
||||||
|
journal mode.
|
||||||
|
|
||||||
|
* journal mode
|
||||||
|
|
||||||
|
data=journal mode provides full data and metadata journaling. All new data is
|
||||||
|
written to the journal first, and then to its final location. In the event of
|
||||||
|
a crash, the journal can be replayed, bringing both data and metadata into a
|
||||||
|
consistent state. This mode is the slowest except when data needs to be read
|
||||||
|
from and written to disk at the same time where it outperforms all other
|
||||||
|
modes. Enabling this mode will disable delayed allocation and O_DIRECT
|
||||||
|
support.
|
||||||
|
|
||||||
|
/proc entries
|
||||||
|
=============
|
||||||
|
|
||||||
|
Information about mounted ext4 file systems can be found in
|
||||||
|
/proc/fs/ext4. Each mounted filesystem will have a directory in
|
||||||
|
/proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or
|
||||||
|
/proc/fs/ext4/dm-0). The files in each per-device directory are shown
|
||||||
|
in the table below.
|
||||||
|
|
||||||
|
Files in /proc/fs/ext4/<devname>
|
||||||
|
|
||||||
|
mb_groups
|
||||||
|
details of multiblock allocator buddy cache of free blocks
|
||||||
|
|
||||||
|
/sys entries
|
||||||
|
============
|
||||||
|
|
||||||
|
Information about mounted ext4 file systems can be found in
|
||||||
|
/sys/fs/ext4. Each mounted filesystem will have a directory in
|
||||||
|
/sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or
|
||||||
|
/sys/fs/ext4/dm-0). The files in each per-device directory are shown
|
||||||
|
in the table below.
|
||||||
|
|
||||||
|
Files in /sys/fs/ext4/<devname>:
|
||||||
|
|
||||||
|
(see also Documentation/ABI/testing/sysfs-fs-ext4)
|
||||||
|
|
||||||
|
delayed_allocation_blocks
|
||||||
|
This file is read-only and shows the number of blocks that are dirty in
|
||||||
|
the page cache, but which do not have their location in the filesystem
|
||||||
|
allocated yet.
|
||||||
|
|
||||||
|
inode_goal
|
||||||
|
Tuning parameter which (if non-zero) controls the goal inode used by
|
||||||
|
the inode allocator in preference to all other allocation heuristics.
|
||||||
|
This is intended for debugging use only, and should be 0 on production
|
||||||
|
systems.
|
||||||
|
|
||||||
|
inode_readahead_blks
|
||||||
|
Tuning parameter which controls the maximum number of inode table
|
||||||
|
blocks that ext4's inode table readahead algorithm will pre-read into
|
||||||
|
the buffer cache.
|
||||||
|
|
||||||
|
lifetime_write_kbytes
|
||||||
|
This file is read-only and shows the number of kilobytes of data that
|
||||||
|
have been written to this filesystem since it was created.
|
||||||
|
|
||||||
|
max_writeback_mb_bump
|
||||||
|
The maximum number of megabytes the writeback code will try to write
|
||||||
|
out before moving on to another inode.
|
||||||
|
|
||||||
|
mb_group_prealloc
|
||||||
|
The multiblock allocator will round up allocation requests to a
|
||||||
|
multiple of this tuning parameter if the stripe size is not set in the
|
||||||
|
ext4 superblock.
|
||||||
|
|
||||||
|
mb_max_to_scan
|
||||||
|
The maximum number of extents the multiblock allocator will search to
|
||||||
|
find the best extent.
|
||||||
|
|
||||||
|
mb_min_to_scan
|
||||||
|
The minimum number of extents the multiblock allocator will search to
|
||||||
|
find the best extent.
|
||||||
|
|
||||||
|
mb_order2_req
|
||||||
|
Tuning parameter which controls the minimum size for requests (as a
|
||||||
|
power of 2) where the buddy cache is used.
|
||||||
|
|
||||||
|
mb_stats
|
||||||
|
Controls whether the multiblock allocator should collect statistics,
|
||||||
|
which are shown during the unmount. 1 means to collect statistics, 0
|
||||||
|
means not to collect statistics.
|
||||||
|
|
||||||
|
mb_stream_req
|
||||||
|
Files which have fewer blocks than this tunable parameter will have
|
||||||
|
their blocks allocated out of a block group specific preallocation
|
||||||
|
pool, so that small files are packed closely together. Each large file
|
||||||
|
will have its blocks allocated out of its own unique preallocation
|
||||||
|
pool.
|
||||||
|
|
||||||
|
session_write_kbytes
|
||||||
|
This file is read-only and shows the number of kilobytes of data that
|
||||||
|
have been written to this filesystem since it was mounted.
|
||||||
|
|
||||||
|
reserved_clusters
|
||||||
|
This is a RW file and contains the number of reserved clusters in the file
|
||||||
|
system which will be used in the specific situations to avoid costly
|
||||||
|
zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or
|
||||||
|
4096 clusters, whichever is smaller and this can be changed however it
|
||||||
|
can never exceed number of clusters in the file system. If there is not
|
||||||
|
enough space for the reserved space when mounting the file system, the mount will
|
||||||
|
_not_ fail.
|
||||||
|
|
||||||
|
Ioctls
|
||||||
|
======
|
||||||
|
|
||||||
|
There is some Ext4 specific functionality which can be accessed by applications
|
||||||
|
through the system call interfaces. The list of all Ext4 specific ioctls is
|
||||||
|
shown in the table below.
|
||||||
|
|
||||||
|
Table of Ext4 specific ioctls
|
||||||
|
|
||||||
|
EXT4_IOC_GETFLAGS
|
||||||
|
Get additional attributes associated with inode. The ioctl argument is
|
||||||
|
an integer bitfield, with bit values described in ext4.h. This ioctl is
|
||||||
|
an alias for FS_IOC_GETFLAGS.
|
||||||
|
|
||||||
|
EXT4_IOC_SETFLAGS
|
||||||
|
Set additional attributes associated with inode. The ioctl argument is
|
||||||
|
an integer bitfield, with bit values described in ext4.h. This ioctl is
|
||||||
|
an alias for FS_IOC_SETFLAGS.
|
||||||
|
|
||||||
|
EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD
|
||||||
|
Get the inode i_generation number stored for each inode. The
|
||||||
|
i_generation number is normally changed only when a new inode is created
|
||||||
|
and it is particularly useful for network filesystems. The '_OLD'
|
||||||
|
version of this ioctl is an alias for FS_IOC_GETVERSION.
|
||||||
|
|
||||||
|
EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD
|
||||||
|
Set the inode i_generation number stored for each inode. The '_OLD'
|
||||||
|
version of this ioctl is an alias for FS_IOC_SETVERSION.
|
||||||
|
|
||||||
|
EXT4_IOC_GROUP_EXTEND
|
||||||
|
This ioctl has the same purpose as the resize mount option. It allows
|
||||||
|
resizing the filesystem to the end of the last existing block group,
|
||||||
|
further resize has to be done with resize2fs, either online, or
|
||||||
|
offline. The argument points to the unsigned long number representing
|
||||||
|
the filesystem new block count.
|
||||||
|
|
||||||
|
EXT4_IOC_MOVE_EXT
|
||||||
|
Move the block extents from orig_fd (the one this ioctl is pointing to)
|
||||||
|
to the donor_fd (the one specified in move_extent structure passed as
|
||||||
|
an argument to this ioctl). Then, exchange inode metadata between
|
||||||
|
orig_fd and donor_fd. This is especially useful for online
|
||||||
|
defragmentation, because the allocator has the opportunity to allocate
|
||||||
|
moved blocks better, ideally into one contiguous extent.
|
||||||
|
|
||||||
|
EXT4_IOC_GROUP_ADD
|
||||||
|
Add a new group descriptor to an existing or new group descriptor
|
||||||
|
block. The new group descriptor is described by ext4_new_group_input
|
||||||
|
structure, which is passed as an argument to this ioctl. This is
|
||||||
|
especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which
|
||||||
|
allows online resize of the filesystem to the end of the last existing
|
||||||
|
block group. Those two ioctls combined are used in userspace online
|
||||||
|
resize tool (e.g. resize2fs).
|
||||||
|
|
||||||
|
EXT4_IOC_MIGRATE
|
||||||
|
This ioctl operates on the filesystem itself. It converts (migrates)
|
||||||
|
ext3 indirect block mapped inode to ext4 extent mapped inode by walking
|
||||||
|
through indirect block mapping of the original inode and converting
|
||||||
|
contiguous block ranges into ext4 extents of the temporary inode. Then,
|
||||||
|
inodes are swapped. This ioctl might help, when migrating from ext3 to
|
||||||
|
ext4 filesystem, however the suggestion is to create a fresh ext4 filesystem
|
||||||
|
and copy data from the backup. Note, that filesystem has to support
|
||||||
|
extents for this ioctl to work.
|
||||||
|
|
||||||
|
EXT4_IOC_ALLOC_DA_BLKS
|
||||||
|
Force all of the delay allocated blocks to be allocated to preserve
|
||||||
|
application-expected ext3 behaviour. Note that this will also start
|
||||||
|
triggering a write of the data blocks, but this behaviour may change in
|
||||||
|
the future as it is not necessary and has been done this way only for
|
||||||
|
sake of simplicity.
|
||||||
|
|
||||||
|
EXT4_IOC_RESIZE_FS
|
||||||
|
Resize the filesystem to a new size. The number of blocks of resized
|
||||||
|
filesystem is passed in via 64 bit integer argument. The kernel
|
||||||
|
allocates bitmaps and inode table, the userspace tool thus just passes
|
||||||
|
the new number of blocks.
|
||||||
|
|
||||||
|
EXT4_IOC_SWAP_BOOT
|
||||||
|
Swap i_blocks and associated attributes (like i_blocks, i_size,
|
||||||
|
i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO
|
||||||
|
(#5). This is typically used to store a boot loader in a secure part of
|
||||||
|
the filesystem, where it can't be changed by a normal user by accident.
|
||||||
|
The data blocks of the previous boot loader will be associated with the
|
||||||
|
given inode.
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
kernel source: <file:fs/ext4/>
|
||||||
|
<file:fs/jbd2/>
|
||||||
|
|
||||||
|
programs: http://e2fsprogs.sourceforge.net/
|
||||||
|
|
||||||
|
useful links: http://fedoraproject.org/wiki/ext3-devel
|
||||||
|
http://www.bullopensource.org/ext4/
|
||||||
|
http://ext4.wiki.kernel.org/index.php/Main_Page
|
||||||
|
http://fedoraproject.org/wiki/Features/Ext4
|
@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
java
|
java
|
||||||
ras
|
ras
|
||||||
bcache
|
bcache
|
||||||
|
ext4
|
||||||
pm/index
|
pm/index
|
||||||
thunderbolt
|
thunderbolt
|
||||||
LSM/index
|
LSM/index
|
||||||
|
@ -383,6 +383,10 @@ latex_documents = [
|
|||||||
'The kernel development community', 'manual'),
|
'The kernel development community', 'manual'),
|
||||||
('filesystems/index', 'filesystems.tex', 'Linux Filesystems API',
|
('filesystems/index', 'filesystems.tex', 'Linux Filesystems API',
|
||||||
'The kernel development community', 'manual'),
|
'The kernel development community', 'manual'),
|
||||||
|
('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide',
|
||||||
|
'ext4 Community', 'manual'),
|
||||||
|
('filesystems/ext4/index', 'ext4-data-structures.tex',
|
||||||
|
'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'),
|
||||||
('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide',
|
('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide',
|
||||||
'The kernel development community', 'manual'),
|
'The kernel development community', 'manual'),
|
||||||
('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
|
('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
|
||||||
|
@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header
|
|||||||
``ext4_xattr_ibody_header`` that is 4 bytes long:
|
``ext4_xattr_ibody_header`` that is 4 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -47,7 +47,7 @@ The beginning of an extended attribute block is in
|
|||||||
``struct ext4_xattr_header``, which is 32 bytes long:
|
``struct ext4_xattr_header``, which is 32 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is
|
|||||||
Attributes stored inside an inode do not need be stored in sorted order.
|
Attributes stored inside an inode do not need be stored in sorted order.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from
|
|||||||
the key name. Here is a map of name index values to key prefixes:
|
the key name. Here is a map of name index values to key prefixes:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Name Index
|
* - Name Index
|
@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes
|
|||||||
(crc32c as of October 2013) unless noted otherwise.
|
(crc32c as of October 2013) unless noted otherwise.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 4
|
:widths: 20 8 50
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Metadata
|
* - Metadata
|
@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference
|
|||||||
``dirent.rec_len`` to know for sure.
|
``dirent.rec_len`` to know for sure.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
|
|||||||
``dirent.rec_len`` to know for sure.
|
``dirent.rec_len`` to know for sure.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
|
|||||||
The directory file type is one of the following values:
|
The directory file type is one of the following values:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is
|
|||||||
``struct ext4_dir_entry_tail``:
|
``struct ext4_dir_entry_tail``:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length
|
|||||||
of a data block:
|
of a data block:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -305,7 +305,7 @@ of a data block:
|
|||||||
The directory hash is one of the following values:
|
The directory hash is one of the following values:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is
|
|||||||
also the full length of a data block:
|
also the full length of a data block:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and
|
|||||||
long:
|
long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum.
|
|||||||
The dx\_tail structure is 8 bytes long and looks like this:
|
The dx\_tail structure is 8 bytes long and looks like this:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
@ -1,613 +0,0 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
|
||||||
|
|
||||||
========================
|
|
||||||
General Information
|
|
||||||
========================
|
|
||||||
|
|
||||||
Ext4 is an advanced level of the ext3 filesystem which incorporates
|
|
||||||
scalability and reliability enhancements for supporting large filesystems
|
|
||||||
(64 bit) in keeping with increasing disk capacities and state-of-the-art
|
|
||||||
feature requirements.
|
|
||||||
|
|
||||||
Mailing list: linux-ext4@vger.kernel.org
|
|
||||||
Web site: http://ext4.wiki.kernel.org
|
|
||||||
|
|
||||||
|
|
||||||
Quick usage instructions
|
|
||||||
========================
|
|
||||||
|
|
||||||
Note: More extensive information for getting started with ext4 can be
|
|
||||||
found at the ext4 wiki site at the URL:
|
|
||||||
http://ext4.wiki.kernel.org/index.php/Ext4_Howto
|
|
||||||
|
|
||||||
- The latest version of e2fsprogs can be found at:
|
|
||||||
|
|
||||||
https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
|
|
||||||
|
|
||||||
or
|
|
||||||
|
|
||||||
http://sourceforge.net/project/showfiles.php?group_id=2406
|
|
||||||
|
|
||||||
or grab the latest git repository from:
|
|
||||||
|
|
||||||
https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
|
|
||||||
|
|
||||||
- Create a new filesystem using the ext4 filesystem type:
|
|
||||||
|
|
||||||
# mke2fs -t ext4 /dev/hda1
|
|
||||||
|
|
||||||
Or to configure an existing ext3 filesystem to support extents:
|
|
||||||
|
|
||||||
# tune2fs -O extents /dev/hda1
|
|
||||||
|
|
||||||
If the filesystem was created with 128 byte inodes, it can be
|
|
||||||
converted to use 256 byte for greater efficiency via:
|
|
||||||
|
|
||||||
# tune2fs -I 256 /dev/hda1
|
|
||||||
|
|
||||||
- Mounting:
|
|
||||||
|
|
||||||
# mount -t ext4 /dev/hda1 /wherever
|
|
||||||
|
|
||||||
- When comparing performance with other filesystems, it's always
|
|
||||||
important to try multiple workloads; very often a subtle change in a
|
|
||||||
workload parameter can completely change the ranking of which
|
|
||||||
filesystems do well compared to others. When comparing versus ext3,
|
|
||||||
note that ext4 enables write barriers by default, while ext3 does
|
|
||||||
not enable write barriers by default. So it is useful to
|
|
||||||
explicitly specify whether barriers are enabled or not via the
|
|
||||||
'-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems
|
|
||||||
for a fair comparison. When tuning ext3 for best benchmark numbers,
|
|
||||||
it is often worthwhile to try changing the data journaling mode; '-o
|
|
||||||
data=writeback' can be faster for some workloads. (Note however that
|
|
||||||
running mounted with data=writeback can potentially leave stale data
|
|
||||||
exposed in recently written files in case of an unclean shutdown,
|
|
||||||
which could be a security exposure in some situations.) Configuring
|
|
||||||
the filesystem with a large journal can also be helpful for
|
|
||||||
metadata-intensive workloads.
|
|
||||||
|
|
||||||
Features
|
|
||||||
========
|
|
||||||
|
|
||||||
Currently Available
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
* ability to use filesystems > 16TB (e2fsprogs support not available yet)
|
|
||||||
* extent format reduces metadata overhead (RAM, IO for access, transactions)
|
|
||||||
* extent format more robust in face of on-disk corruption due to magics,
|
|
||||||
* internal redundancy in tree
|
|
||||||
* improved file allocation (multi-block alloc)
|
|
||||||
* lift 32000 subdirectory limit imposed by i_links_count[1]
|
|
||||||
* nsec timestamps for mtime, atime, ctime, create time
|
|
||||||
* inode version field on disk (NFSv4, Lustre)
|
|
||||||
* reduced e2fsck time via uninit_bg feature
|
|
||||||
* journal checksumming for robustness, performance
|
|
||||||
* persistent file preallocation (e.g for streaming media, databases)
|
|
||||||
* ability to pack bitmaps and inode tables into larger virtual groups via the
|
|
||||||
flex_bg feature
|
|
||||||
* large file support
|
|
||||||
* inode allocation using large virtual block groups via flex_bg
|
|
||||||
* delayed allocation
|
|
||||||
* large block (up to pagesize) support
|
|
||||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
|
||||||
the ordering)
|
|
||||||
|
|
||||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
|
||||||
directory hash tree having a maximum depth of two.
|
|
||||||
|
|
||||||
Options
|
|
||||||
=======
|
|
||||||
|
|
||||||
When mounting an ext4 filesystem, the following options are accepted:
|
|
||||||
(*) == default
|
|
||||||
|
|
||||||
======================= =======================================================
|
|
||||||
Mount Option Description
|
|
||||||
======================= =======================================================
|
|
||||||
ro Mount filesystem read only. Note that ext4 will
|
|
||||||
replay the journal (and thus write to the
|
|
||||||
partition) even when mounted "read only". The
|
|
||||||
mount options "ro,noload" can be used to prevent
|
|
||||||
writes to the filesystem.
|
|
||||||
|
|
||||||
journal_checksum Enable checksumming of the journal transactions.
|
|
||||||
This will allow the recovery code in e2fsck and the
|
|
||||||
kernel to detect corruption in the journal. It is a
|
|
||||||
compatible change and will be ignored by older kernels.
|
|
||||||
|
|
||||||
journal_async_commit Commit block can be written to disk without waiting
|
|
||||||
for descriptor blocks. If enabled older kernels cannot
|
|
||||||
mount the device. This will enable 'journal_checksum'
|
|
||||||
internally.
|
|
||||||
|
|
||||||
journal_path=path
|
|
||||||
journal_dev=devnum When the external journal device's major/minor numbers
|
|
||||||
have changed, these options allow the user to specify
|
|
||||||
the new journal location. The journal device is
|
|
||||||
identified through either its new major/minor numbers
|
|
||||||
encoded in devnum, or via a path to the device.
|
|
||||||
|
|
||||||
norecovery Don't load the journal on mounting. Note that
|
|
||||||
noload if the filesystem was not unmounted cleanly,
|
|
||||||
skipping the journal replay will lead to the
|
|
||||||
filesystem containing inconsistencies that can
|
|
||||||
lead to any number of problems.
|
|
||||||
|
|
||||||
data=journal All data are committed into the journal prior to being
|
|
||||||
written into the main file system. Enabling
|
|
||||||
this mode will disable delayed allocation and
|
|
||||||
O_DIRECT support.
|
|
||||||
|
|
||||||
data=ordered (*) All data are forced directly out to the main file
|
|
||||||
system prior to its metadata being committed to the
|
|
||||||
journal.
|
|
||||||
|
|
||||||
data=writeback Data ordering is not preserved, data may be written
|
|
||||||
into the main file system after its metadata has been
|
|
||||||
committed to the journal.
|
|
||||||
|
|
||||||
commit=nrsec (*) Ext4 can be told to sync all its data and metadata
|
|
||||||
every 'nrsec' seconds. The default value is 5 seconds.
|
|
||||||
This means that if you lose your power, you will lose
|
|
||||||
as much as the latest 5 seconds of work (your
|
|
||||||
filesystem will not be damaged though, thanks to the
|
|
||||||
journaling). This default value (or any low value)
|
|
||||||
will hurt performance, but it's good for data-safety.
|
|
||||||
Setting it to 0 will have the same effect as leaving
|
|
||||||
it at the default (5 seconds).
|
|
||||||
Setting it to very large values will improve
|
|
||||||
performance.
|
|
||||||
|
|
||||||
barrier=<0|1(*)> This enables/disables the use of write barriers in
|
|
||||||
barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
|
|
||||||
nobarrier This also requires an IO stack which can support
|
|
||||||
barriers, and if jbd gets an error on a barrier
|
|
||||||
write, it will disable again with a warning.
|
|
||||||
Write barriers enforce proper on-disk ordering
|
|
||||||
of journal commits, making volatile disk write caches
|
|
||||||
safe to use, at some performance penalty. If
|
|
||||||
your disks are battery-backed in one way or another,
|
|
||||||
disabling barriers may safely improve performance.
|
|
||||||
The mount options "barrier" and "nobarrier" can
|
|
||||||
also be used to enable or disable barriers, for
|
|
||||||
consistency with other ext4 mount options.
|
|
||||||
|
|
||||||
inode_readahead_blks=n This tuning parameter controls the maximum
|
|
||||||
number of inode table blocks that ext4's inode
|
|
||||||
table readahead algorithm will pre-read into
|
|
||||||
the buffer cache. The default value is 32 blocks.
|
|
||||||
|
|
||||||
nouser_xattr Disables Extended User Attributes. See the
|
|
||||||
attr(5) manual page for more information about
|
|
||||||
extended attributes.
|
|
||||||
|
|
||||||
noacl This option disables POSIX Access Control List
|
|
||||||
support. If ACL support is enabled in the kernel
|
|
||||||
configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is
|
|
||||||
enabled by default on mount. See the acl(5) manual
|
|
||||||
page for more information about acl.
|
|
||||||
|
|
||||||
bsddf (*) Make 'df' act like BSD.
|
|
||||||
minixdf Make 'df' act like Minix.
|
|
||||||
|
|
||||||
debug Extra debugging information is sent to syslog.
|
|
||||||
|
|
||||||
abort Simulate the effects of calling ext4_abort() for
|
|
||||||
debugging purposes. This is normally used while
|
|
||||||
remounting a filesystem which is already mounted.
|
|
||||||
|
|
||||||
errors=remount-ro Remount the filesystem read-only on an error.
|
|
||||||
errors=continue Keep going on a filesystem error.
|
|
||||||
errors=panic Panic and halt the machine if an error occurs.
|
|
||||||
(These mount options override the errors behavior
|
|
||||||
specified in the superblock, which can be configured
|
|
||||||
using tune2fs)
|
|
||||||
|
|
||||||
data_err=ignore(*) Just print an error message if an error occurs
|
|
||||||
in a file data buffer in ordered mode.
|
|
||||||
data_err=abort Abort the journal if an error occurs in a file
|
|
||||||
data buffer in ordered mode.
|
|
||||||
|
|
||||||
grpid New objects have the group ID of their parent.
|
|
||||||
bsdgroups
|
|
||||||
|
|
||||||
nogrpid (*) New objects have the group ID of their creator.
|
|
||||||
sysvgroups
|
|
||||||
|
|
||||||
resgid=n The group ID which may use the reserved blocks.
|
|
||||||
|
|
||||||
resuid=n The user ID which may use the reserved blocks.
|
|
||||||
|
|
||||||
sb=n Use alternate superblock at this location.
|
|
||||||
|
|
||||||
quota These options are ignored by the filesystem. They
|
|
||||||
noquota are used only by quota tools to recognize volumes
|
|
||||||
grpquota where quota should be turned on. See documentation
|
|
||||||
usrquota in the quota-tools package for more details
|
|
||||||
(http://sourceforge.net/projects/linuxquota).
|
|
||||||
|
|
||||||
jqfmt=<quota type> These options tell filesystem details about quota
|
|
||||||
usrjquota=<file> so that quota information can be properly updated
|
|
||||||
grpjquota=<file> during journal replay. They replace the above
|
|
||||||
quota options. See documentation in the quota-tools
|
|
||||||
package for more details
|
|
||||||
(http://sourceforge.net/projects/linuxquota).
|
|
||||||
|
|
||||||
stripe=n Number of filesystem blocks that mballoc will try
|
|
||||||
to use for allocation size and alignment. For RAID5/6
|
|
||||||
systems this should be the number of data
|
|
||||||
disks * RAID chunk size in file system blocks.
|
|
||||||
|
|
||||||
delalloc (*) Defer block allocation until just before ext4
|
|
||||||
writes out the block(s) in question. This
|
|
||||||
allows ext4 to make better allocation decisions
|
|
||||||
more efficiently.
|
|
||||||
nodelalloc Disable delayed allocation. Blocks are allocated
|
|
||||||
when the data is copied from userspace to the
|
|
||||||
page cache, either via the write(2) system call
|
|
||||||
or when an mmap'ed page which was previously
|
|
||||||
unallocated is written for the first time.
|
|
||||||
|
|
||||||
max_batch_time=usec Maximum amount of time ext4 should wait for
|
|
||||||
additional filesystem operations to be batched
|
|
||||||
together with a synchronous write operation.
|
|
||||||
Since a synchronous write operation is going to
|
|
||||||
force a commit and then a wait for the I/O
|
|
||||||
to complete, it doesn't cost much, and can be a
|
|
||||||
huge throughput win, we wait for a small amount
|
|
||||||
of time to see if any other transactions can
|
|
||||||
piggyback on the synchronous write. The
|
|
||||||
algorithm used is designed to automatically tune
|
|
||||||
for the speed of the disk, by measuring the
|
|
||||||
amount of time (on average) that it takes to
|
|
||||||
finish committing a transaction. Call this time
|
|
||||||
the "commit time". If the time that the
|
|
||||||
transaction has been running is less than the
|
|
||||||
commit time, ext4 will try sleeping for the
|
|
||||||
commit time to see if other operations will join
|
|
||||||
the transaction. The commit time is capped by
|
|
||||||
the max_batch_time, which defaults to 15000us
|
|
||||||
(15ms). This optimization can be turned off
|
|
||||||
entirely by setting max_batch_time to 0.
|
|
||||||
|
|
||||||
min_batch_time=usec This parameter sets the commit time (as
|
|
||||||
described above) to be at least min_batch_time.
|
|
||||||
It defaults to zero microseconds. Increasing
|
|
||||||
this parameter may improve the throughput of
|
|
||||||
multi-threaded, synchronous workloads on very
|
|
||||||
fast disks, at the cost of increasing latency.
|
|
||||||
|
|
||||||
journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
|
|
||||||
highest priority) which should be used for I/O
|
|
||||||
operations submitted by kjournald2 during a
|
|
||||||
commit operation. This defaults to 3, which is
|
|
||||||
a slightly higher priority than the default I/O
|
|
||||||
priority.
|
|
||||||
|
|
||||||
auto_da_alloc(*) Many broken applications don't use fsync() when
|
|
||||||
noauto_da_alloc replacing existing files via patterns such as
|
|
||||||
fd = open("foo.new")/write(fd,..)/close(fd)/
|
|
||||||
rename("foo.new", "foo"), or worse yet,
|
|
||||||
fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
|
|
||||||
If auto_da_alloc is enabled, ext4 will detect
|
|
||||||
the replace-via-rename and replace-via-truncate
|
|
||||||
patterns and force that any delayed allocation
|
|
||||||
blocks are allocated such that at the next
|
|
||||||
journal commit, in the default data=ordered
|
|
||||||
mode, the data blocks of the new file are forced
|
|
||||||
to disk before the rename() operation is
|
|
||||||
committed. This provides roughly the same level
|
|
||||||
of guarantees as ext3, and avoids the
|
|
||||||
"zero-length" problem that can happen when a
|
|
||||||
system crashes before the delayed allocation
|
|
||||||
blocks are forced to disk.
|
|
||||||
|
|
||||||
noinit_itable Do not initialize any uninitialized inode table
|
|
||||||
blocks in the background. This feature may be
|
|
||||||
used by installation CD's so that the install
|
|
||||||
process can complete as quickly as possible; the
|
|
||||||
inode table initialization process would then be
|
|
||||||
deferred until the next time the file system
|
|
||||||
is unmounted.
|
|
||||||
|
|
||||||
init_itable=n The lazy itable init code will wait n times the
|
|
||||||
number of milliseconds it took to zero out the
|
|
||||||
previous block group's inode table. This
|
|
||||||
minimizes the impact on the system performance
|
|
||||||
while file system's inode table is being initialized.
|
|
||||||
|
|
||||||
discard Controls whether ext4 should issue discard/TRIM
|
|
||||||
nodiscard(*) commands to the underlying block device when
|
|
||||||
blocks are freed. This is useful for SSD devices
|
|
||||||
and sparse/thinly-provisioned LUNs, but it is off
|
|
||||||
by default until sufficient testing has been done.
|
|
||||||
|
|
||||||
nouid32 Disables 32-bit UIDs and GIDs. This is for
|
|
||||||
interoperability with older kernels which only
|
|
||||||
store and expect 16-bit values.
|
|
||||||
|
|
||||||
block_validity(*) These options enable or disable the in-kernel
|
|
||||||
noblock_validity facility for tracking filesystem metadata blocks
|
|
||||||
within internal data structures. This allows multi-
|
|
||||||
block allocator and other routines to notice
|
|
||||||
bugs or corrupted allocation bitmaps which cause
|
|
||||||
blocks to be allocated which overlap with
|
|
||||||
filesystem metadata blocks.
|
|
||||||
|
|
||||||
dioread_lock Controls whether or not ext4 should use the DIO read
|
|
||||||
dioread_nolock locking. If the dioread_nolock option is specified
|
|
||||||
ext4 will allocate uninitialized extent before buffer
|
|
||||||
write and convert the extent to initialized after IO
|
|
||||||
completes. This approach allows ext4 code to avoid
|
|
||||||
using inode mutex, which improves scalability on high
|
|
||||||
speed storages. However this does not work with
|
|
||||||
data journaling and dioread_nolock option will be
|
|
||||||
ignored with kernel warning. Note that dioread_nolock
|
|
||||||
code path is only used for extent-based files.
|
|
||||||
Because of the restrictions this option comprises
|
|
||||||
it is off by default (i.e. dioread_lock).
|
|
||||||
|
|
||||||
max_dir_size_kb=n This limits the size of directories so that any
|
|
||||||
attempt to expand them beyond the specified
|
|
||||||
limit in kilobytes will cause an ENOSPC error.
|
|
||||||
This is useful in memory constrained
|
|
||||||
environments, where a very large directory can
|
|
||||||
cause severe performance problems or even
|
|
||||||
provoke the Out Of Memory killer. (For example,
|
|
||||||
if there is only 512mb memory available, a 176mb
|
|
||||||
directory may seriously cramp the system's style.)
|
|
||||||
|
|
||||||
i_version Enable 64-bit inode version support. This option is
|
|
||||||
off by default.
|
|
||||||
|
|
||||||
dax Use direct access (no page cache). See
|
|
||||||
Documentation/filesystems/dax.txt. Note that
|
|
||||||
this option is incompatible with data=journal.
|
|
||||||
======================= =======================================================
|
|
||||||
|
|
||||||
Data Mode
|
|
||||||
=========
|
|
||||||
There are 3 different data modes:
|
|
||||||
|
|
||||||
* writeback mode
|
|
||||||
|
|
||||||
In data=writeback mode, ext4 does not journal data at all. This mode provides
|
|
||||||
a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
|
|
||||||
mode - metadata journaling. A crash+recovery can cause incorrect data to
|
|
||||||
appear in files which were written shortly before the crash. This mode will
|
|
||||||
typically provide the best ext4 performance.
|
|
||||||
|
|
||||||
* ordered mode
|
|
||||||
|
|
||||||
In data=ordered mode, ext4 only officially journals metadata, but it logically
|
|
||||||
groups metadata information related to data changes with the data blocks into
|
|
||||||
a single unit called a transaction. When it's time to write the new metadata
|
|
||||||
out to disk, the associated data blocks are written first. In general, this
|
|
||||||
mode performs slightly slower than writeback but significantly faster than
|
|
||||||
journal mode.
|
|
||||||
|
|
||||||
* journal mode
|
|
||||||
|
|
||||||
data=journal mode provides full data and metadata journaling. All new data is
|
|
||||||
written to the journal first, and then to its final location. In the event of
|
|
||||||
a crash, the journal can be replayed, bringing both data and metadata into a
|
|
||||||
consistent state. This mode is the slowest except when data needs to be read
|
|
||||||
from and written to disk at the same time where it outperforms all other
|
|
||||||
modes. Enabling this mode will disable delayed allocation and O_DIRECT
|
|
||||||
support.
|
|
||||||
|
|
||||||
/proc entries
|
|
||||||
=============
|
|
||||||
|
|
||||||
Information about mounted ext4 file systems can be found in
|
|
||||||
/proc/fs/ext4. Each mounted filesystem will have a directory in
|
|
||||||
/proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or
|
|
||||||
/proc/fs/ext4/dm-0). The files in each per-device directory are shown
|
|
||||||
in table below.
|
|
||||||
|
|
||||||
Files in /proc/fs/ext4/<devname>
|
|
||||||
|
|
||||||
================ =======
|
|
||||||
File Content
|
|
||||||
================ =======
|
|
||||||
mb_groups details of multiblock allocator buddy cache of free blocks
|
|
||||||
================ =======
|
|
||||||
|
|
||||||
/sys entries
|
|
||||||
============
|
|
||||||
|
|
||||||
Information about mounted ext4 file systems can be found in
|
|
||||||
/sys/fs/ext4. Each mounted filesystem will have a directory in
|
|
||||||
/sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or
|
|
||||||
/sys/fs/ext4/dm-0). The files in each per-device directory are shown
|
|
||||||
in table below.
|
|
||||||
|
|
||||||
Files in /sys/fs/ext4/<devname>:
|
|
||||||
|
|
||||||
(see also Documentation/ABI/testing/sysfs-fs-ext4)
|
|
||||||
|
|
||||||
============================= =================================================
|
|
||||||
File Content
|
|
||||||
============================= =================================================
|
|
||||||
delayed_allocation_blocks This file is read-only and shows the number of
|
|
||||||
blocks that are dirty in the page cache, but
|
|
||||||
which do not have their location in the
|
|
||||||
filesystem allocated yet.
|
|
||||||
|
|
||||||
inode_goal Tuning parameter which (if non-zero) controls
|
|
||||||
the goal inode used by the inode allocator in
|
|
||||||
preference to all other allocation heuristics.
|
|
||||||
This is intended for debugging use only, and
|
|
||||||
should be 0 on production systems.
|
|
||||||
|
|
||||||
inode_readahead_blks Tuning parameter which controls the maximum
|
|
||||||
number of inode table blocks that ext4's inode
|
|
||||||
table readahead algorithm will pre-read into
|
|
||||||
the buffer cache
|
|
||||||
|
|
||||||
lifetime_write_kbytes This file is read-only and shows the number of
|
|
||||||
kilobytes of data that have been written to this
|
|
||||||
filesystem since it was created.
|
|
||||||
|
|
||||||
max_writeback_mb_bump The maximum number of megabytes the writeback
|
|
||||||
code will try to write out before moving on to
|
|
||||||
another inode.
|
|
||||||
|
|
||||||
mb_group_prealloc The multiblock allocator will round up allocation
|
|
||||||
requests to a multiple of this tuning parameter if
|
|
||||||
the stripe size is not set in the ext4 superblock
|
|
||||||
|
|
||||||
mb_max_to_scan The maximum number of extents the multiblock
|
|
||||||
allocator will search to find the best extent
|
|
||||||
|
|
||||||
mb_min_to_scan The minimum number of extents the multiblock
|
|
||||||
allocator will search to find the best extent
|
|
||||||
|
|
||||||
mb_order2_req Tuning parameter which controls the minimum size
|
|
||||||
for requests (as a power of 2) where the buddy
|
|
||||||
cache is used
|
|
||||||
|
|
||||||
mb_stats Controls whether the multiblock allocator should
|
|
||||||
collect statistics, which are shown during the
|
|
||||||
unmount. 1 means to collect statistics, 0 means
|
|
||||||
not to collect statistics
|
|
||||||
|
|
||||||
mb_stream_req Files which have fewer blocks than this tunable
|
|
||||||
parameter will have their blocks allocated out
|
|
||||||
of a block group specific preallocation pool, so
|
|
||||||
that small files are packed closely together.
|
|
||||||
Each large file will have its blocks allocated
|
|
||||||
out of its own unique preallocation pool.
|
|
||||||
|
|
||||||
session_write_kbytes This file is read-only and shows the number of
|
|
||||||
kilobytes of data that have been written to this
|
|
||||||
filesystem since it was mounted.
|
|
||||||
|
|
||||||
reserved_clusters This is a RW file and contains the number of reserved
|
|
||||||
clusters in the file system which will be used
|
|
||||||
in the specific situations to avoid costly
|
|
||||||
zeroout, unexpected ENOSPC, or possible data
|
|
||||||
loss. The default is 2% or 4096 clusters,
|
|
||||||
whichever is smaller and this can be changed
|
|
||||||
however it can never exceed number of clusters
|
|
||||||
in the file system. If there is not enough space
|
|
||||||
for the reserved space when mounting the file
|
|
||||||
system, the mount will _not_ fail.
|
|
||||||
============================= =================================================
|
|
||||||
|
|
||||||
Ioctls
|
|
||||||
======
|
|
||||||
|
|
||||||
There is some Ext4 specific functionality which can be accessed by applications
|
|
||||||
through the system call interfaces. The list of all Ext4 specific ioctls is
|
|
||||||
shown in the table below.
|
|
||||||
|
|
||||||
Table of Ext4 specific ioctls
|
|
||||||
|
|
||||||
============================= =================================================
|
|
||||||
Ioctl Description
|
|
||||||
============================= =================================================
|
|
||||||
EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
|
|
||||||
The ioctl argument is an integer bitfield, with
|
|
||||||
bit values described in ext4.h. This ioctl is an
|
|
||||||
alias for FS_IOC_GETFLAGS.
|
|
||||||
|
|
||||||
EXT4_IOC_SETFLAGS Set additional attributes associated with inode.
|
|
||||||
The ioctl argument is an integer bitfield, with
|
|
||||||
bit values described in ext4.h. This ioctl is an
|
|
||||||
alias for FS_IOC_SETFLAGS.
|
|
||||||
|
|
||||||
EXT4_IOC_GETVERSION
|
|
||||||
EXT4_IOC_GETVERSION_OLD
|
|
||||||
Get the inode i_generation number stored for
|
|
||||||
each inode. The i_generation number is normally
|
|
||||||
changed only when a new inode is created and it is
|
|
||||||
particularly useful for network filesystems. The
|
|
||||||
'_OLD' version of this ioctl is an alias for
|
|
||||||
FS_IOC_GETVERSION.
|
|
||||||
|
|
||||||
EXT4_IOC_SETVERSION
|
|
||||||
EXT4_IOC_SETVERSION_OLD
|
|
||||||
Set the inode i_generation number stored for
|
|
||||||
each inode. The '_OLD' version of this ioctl
|
|
||||||
is an alias for FS_IOC_SETVERSION.
|
|
||||||
|
|
||||||
EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize
|
|
||||||
mount option. It allows resizing the filesystem
|
|
||||||
to the end of the last existing block group,
|
|
||||||
further resize has to be done with resize2fs,
|
|
||||||
either online, or offline. The argument points
|
|
||||||
to the unsigned long number representing the
|
|
||||||
filesystem's new block count.
|
|
||||||
|
|
||||||
EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one
|
|
||||||
this ioctl is pointing to) to the donor_fd (the
|
|
||||||
one specified in move_extent structure passed
|
|
||||||
as an argument to this ioctl). Then, exchange
|
|
||||||
inode metadata between orig_fd and donor_fd.
|
|
||||||
This is especially useful for online
|
|
||||||
defragmentation, because the allocator has the
|
|
||||||
opportunity to allocate moved blocks better,
|
|
||||||
ideally into one contiguous extent.
|
|
||||||
|
|
||||||
EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or
|
|
||||||
new group descriptor block. The new group
|
|
||||||
descriptor is described by ext4_new_group_input
|
|
||||||
structure, which is passed as an argument to
|
|
||||||
this ioctl. This is especially useful in
|
|
||||||
conjunction with EXT4_IOC_GROUP_EXTEND,
|
|
||||||
which allows online resize of the filesystem
|
|
||||||
to the end of the last existing block group.
|
|
||||||
Those two ioctls combined are used in the userspace
|
|
||||||
online resize tool (e.g. resize2fs).
|
|
||||||
|
|
||||||
EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself.
|
|
||||||
It converts (migrates) ext3 indirect block mapped
|
|
||||||
inode to ext4 extent mapped inode by walking
|
|
||||||
through indirect block mapping of the original
|
|
||||||
inode and converting contiguous block ranges
|
|
||||||
into ext4 extents of the temporary inode. Then,
|
|
||||||
inodes are swapped. This ioctl might help, when
|
|
||||||
migrating from ext3 to ext4 filesystem, however
|
|
||||||
the suggestion is to create a fresh ext4 filesystem
|
|
||||||
and copy data from the backup. Note, that
|
|
||||||
filesystem has to support extents for this ioctl
|
|
||||||
to work.
|
|
||||||
|
|
||||||
EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be
|
|
||||||
allocated to preserve application-expected ext3
|
|
||||||
behaviour. Note that this will also start
|
|
||||||
triggering a write of the data blocks, but this
|
|
||||||
behaviour may change in the future as it is
|
|
||||||
not necessary and has been done this way only
|
|
||||||
for the sake of simplicity.
|
|
||||||
|
|
||||||
EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number
|
|
||||||
of blocks of resized filesystem is passed in via
|
|
||||||
64 bit integer argument. The kernel allocates
|
|
||||||
bitmaps and inode table, the userspace tool thus
|
|
||||||
just passes the new number of blocks.
|
|
||||||
|
|
||||||
EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
|
|
||||||
(like i_blocks, i_size, i_flags, ...) from
|
|
||||||
the specified inode with inode
|
|
||||||
EXT4_BOOT_LOADER_INO (#5). This is typically
|
|
||||||
used to store a boot loader in a secure part of
|
|
||||||
the filesystem, where it can't be changed by a
|
|
||||||
normal user by accident.
|
|
||||||
The data blocks of the previous boot loader
|
|
||||||
will be associated with the given inode.
|
|
||||||
============================= =================================================
|
|
||||||
|
|
||||||
References
|
|
||||||
==========
|
|
||||||
|
|
||||||
kernel source: <file:fs/ext4/>
|
|
||||||
<file:fs/jbd2/>
|
|
||||||
|
|
||||||
programs: http://e2fsprogs.sourceforge.net/
|
|
||||||
|
|
||||||
useful links: http://fedoraproject.org/wiki/ext3-devel
|
|
||||||
http://www.bullopensource.org/ext4/
|
|
||||||
http://ext4.wiki.kernel.org/index.php/Main_Page
|
|
||||||
http://fedoraproject.org/wiki/Features/Ext4
|
|
@ -43,7 +43,7 @@ entire bitmap.
|
|||||||
The block group descriptor is laid out in ``struct ext4_group_desc``.
|
The block group descriptor is laid out in ``struct ext4_group_desc``.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``.
|
|||||||
Block group flags can be any combination of the following:
|
Block group flags can be any combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``,
|
|||||||
which is 12 bytes long:
|
which is 12 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are
|
|||||||
recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
|
recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
|
|||||||
and are also 12 bytes long:
|
and are also 12 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -174,7 +174,7 @@ including) the checksum itself.
|
|||||||
``struct ext4_extent_tail`` is 4 bytes long:
|
``struct ext4_extent_tail`` is 4 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
@ -1,17 +1,14 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
===============
|
===================================
|
||||||
ext4 Filesystem
|
ext4 Data Structures and Algorithms
|
||||||
===============
|
===================================
|
||||||
|
|
||||||
General usage and on-disk artifacts written by ext4. More documentation may
|
|
||||||
be ported from the wiki as time permits. This should be considered the
|
|
||||||
canonical source of information as the details here have been reviewed by
|
|
||||||
the ext4 community.
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 5
|
:maxdepth: 6
|
||||||
:numbered:
|
:numbered:
|
||||||
|
|
||||||
ext4
|
about.rst
|
||||||
ondisk/index
|
overview.rst
|
||||||
|
globals.rst
|
||||||
|
dynamic.rst
|
||||||
|
@ -29,8 +29,9 @@ and the inode structure itself.
|
|||||||
The inode table entry is laid out in ``struct ext4_inode``.
|
The inode table entry is laid out in ``struct ext4_inode``.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
:class: longtable
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
- Size
|
- Size
|
||||||
@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``.
|
|||||||
The ``i_mode`` value is a combination of the following flags:
|
The ``i_mode`` value is a combination of the following flags:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags:
|
|||||||
The ``i_flags`` field is a combination of these values:
|
The ``i_flags`` field is a combination of these values:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator:
|
|||||||
Linux:
|
Linux:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -331,7 +332,7 @@ Linux:
|
|||||||
Hurd:
|
Hurd:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -346,7 +347,7 @@ Hurd:
|
|||||||
Masix:
|
Masix:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator:
|
|||||||
Linux:
|
Linux:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -402,7 +403,7 @@ Linux:
|
|||||||
Hurd:
|
Hurd:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -433,7 +434,7 @@ Hurd:
|
|||||||
Masix:
|
Masix:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
@ -48,7 +48,7 @@ Layout
|
|||||||
Generally speaking, the journal has this format:
|
Generally speaking, the journal has this format:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 78
|
:widths: 16 48 16
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Superblock
|
* - Superblock
|
||||||
@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the
|
|||||||
superblock.
|
superblock.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 1 76
|
:widths: 12 12 12 32 12
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - 1024 bytes of padding
|
* - 1024 bytes of padding
|
||||||
@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header
|
|||||||
``struct journal_header_s``:
|
``struct journal_header_s``:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header
|
|||||||
The journal block type can be any one of:
|
The journal block type can be any one of:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``,
|
|||||||
which is 1024 bytes long:
|
which is 1024 bytes long:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -264,7 +264,7 @@ which is 1024 bytes long:
|
|||||||
The journal compat features are any combination of the following:
|
The journal compat features are any combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -278,7 +278,7 @@ The journal compat features are any combination of the following:
|
|||||||
The journal incompat features are any combination of the following:
|
The journal incompat features are any combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the
|
|||||||
most likely choices.
|
most likely choices.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway.
|
|||||||
Descriptor blocks consume at least 36 bytes, but use a full block:
|
Descriptor blocks consume at least 36 bytes, but use a full block:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the
|
|||||||
following. The size is 16 or 32 bytes.
|
following. The size is 16 or 32 bytes.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes.
|
|||||||
The journal tag flags are any combination of the following:
|
The journal tag flags are any combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the
|
|||||||
following. The size is 8, 12, 24, or 28 bytes:
|
following. The size is 8, 12, 24, or 28 bytes:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
|
|||||||
``struct jbd2_journal_block_tail``, which looks like this:
|
``struct jbd2_journal_block_tail``, which looks like this:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -513,7 +513,7 @@ Revocation blocks are described in
|
|||||||
length, but use a full block:
|
length, but use a full block:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
|
|||||||
block is a ``struct jbd2_journal_revoke_tail``, which has this format:
|
block is a ``struct jbd2_journal_revoke_tail``, which has this format:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32
|
|||||||
bytes long (but uses a full block):
|
bytes long (but uses a full block):
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure.
|
|||||||
The MMP structure (``struct mmp_struct``) is as follows:
|
The MMP structure (``struct mmp_struct``) is as follows:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 12 20 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
@ -1,9 +0,0 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
|
||||||
|
|
||||||
==============================
|
|
||||||
Data Structures and Algorithms
|
|
||||||
==============================
|
|
||||||
.. include:: about.rst
|
|
||||||
.. include:: overview.rst
|
|
||||||
.. include:: globals.rst
|
|
||||||
.. include:: dynamic.rst
|
|
@ -6,7 +6,7 @@ Special inodes
|
|||||||
ext4 reserves some inode for special features, as follows:
|
ext4 reserves some inode for special features, as follows:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 6 70
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - inode Number
|
* - inode Number
|
@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in
|
|||||||
``struct ext4_super_block``:
|
``struct ext4_super_block``:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 1 1 77
|
:widths: 8 8 24 40
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Offset
|
* - Offset
|
||||||
@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in
|
|||||||
The superblock state is some combination of the following:
|
The superblock state is some combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -500,7 +500,7 @@ The superblock state is some combination of the following:
|
|||||||
The superblock error policy is one of the following:
|
The superblock error policy is one of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -517,7 +517,7 @@ The superblock error policy is one of the following:
|
|||||||
The filesystem creator is one of the following:
|
The filesystem creator is one of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -538,7 +538,7 @@ The filesystem creator is one of the following:
|
|||||||
The superblock revision is one of the following:
|
The superblock revision is one of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the
|
|||||||
following:
|
following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the
|
|||||||
following:
|
following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of
|
|||||||
the following:
|
the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 16 64
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -702,7 +702,7 @@ the following:
|
|||||||
The ``s_def_hash_version`` field is one of the following:
|
The ``s_def_hash_version`` field is one of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following:
|
|||||||
The ``s_default_mount_opts`` field is any combination of the following:
|
The ``s_default_mount_opts`` field is any combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following:
|
|||||||
The ``s_flags`` field is any combination of the following:
|
The ``s_flags`` field is any combination of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
||||||
@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following:
|
|||||||
The ``s_encrypt_algos`` list can contain any of the following:
|
The ``s_encrypt_algos`` list can contain any of the following:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:widths: 1 79
|
:widths: 8 72
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Value
|
* - Value
|
@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
|
|||||||
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
|
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
|
||||||
default_acl, XATTR_CREATE);
|
default_acl, XATTR_CREATE);
|
||||||
posix_acl_release(default_acl);
|
posix_acl_release(default_acl);
|
||||||
|
} else {
|
||||||
|
inode->i_default_acl = NULL;
|
||||||
}
|
}
|
||||||
if (acl) {
|
if (acl) {
|
||||||
if (!error)
|
if (!error)
|
||||||
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
|
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
|
||||||
acl, XATTR_CREATE);
|
acl, XATTR_CREATE);
|
||||||
posix_acl_release(acl);
|
posix_acl_release(acl);
|
||||||
|
} else {
|
||||||
|
inode->i_acl = NULL;
|
||||||
}
|
}
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
@ -628,6 +628,7 @@ enum {
|
|||||||
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
|
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
|
||||||
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
|
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
|
||||||
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
|
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
|
||||||
|
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ioctl commands
|
* ioctl commands
|
||||||
@ -1030,6 +1031,9 @@ struct ext4_inode_info {
|
|||||||
ext4_lblk_t i_da_metadata_calc_last_lblock;
|
ext4_lblk_t i_da_metadata_calc_last_lblock;
|
||||||
int i_da_metadata_calc_len;
|
int i_da_metadata_calc_len;
|
||||||
|
|
||||||
|
/* pending cluster reservations for bigalloc file systems */
|
||||||
|
struct ext4_pending_tree i_pending_tree;
|
||||||
|
|
||||||
/* on-disk additional length */
|
/* on-disk additional length */
|
||||||
__u16 i_extra_isize;
|
__u16 i_extra_isize;
|
||||||
|
|
||||||
@ -1401,7 +1405,8 @@ struct ext4_sb_info {
|
|||||||
u32 s_min_batch_time;
|
u32 s_min_batch_time;
|
||||||
struct block_device *journal_bdev;
|
struct block_device *journal_bdev;
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */
|
/* Names of quota files with journalled quota */
|
||||||
|
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
|
||||||
int s_jquota_fmt; /* Format of quota to use */
|
int s_jquota_fmt; /* Format of quota to use */
|
||||||
#endif
|
#endif
|
||||||
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
|
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
|
||||||
@ -2483,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *);
|
|||||||
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
|
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
|
||||||
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
|
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
|
||||||
loff_t lstart, loff_t lend);
|
loff_t lstart, loff_t lend);
|
||||||
extern int ext4_page_mkwrite(struct vm_fault *vmf);
|
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
|
||||||
extern int ext4_filemap_fault(struct vm_fault *vmf);
|
extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
|
||||||
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
|
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
|
||||||
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
|
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
|
||||||
|
extern void ext4_da_release_space(struct inode *inode, int to_free);
|
||||||
extern void ext4_da_update_reserve_space(struct inode *inode,
|
extern void ext4_da_update_reserve_space(struct inode *inode,
|
||||||
int used, int quota_claim);
|
int used, int quota_claim);
|
||||||
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
|
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
|
||||||
@ -3142,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
|
|||||||
int flags);
|
int flags);
|
||||||
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
|
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
|
||||||
extern int ext4_ext_check_inode(struct inode *inode);
|
extern int ext4_ext_check_inode(struct inode *inode);
|
||||||
extern int ext4_find_delalloc_range(struct inode *inode,
|
|
||||||
ext4_lblk_t lblk_start,
|
|
||||||
ext4_lblk_t lblk_end);
|
|
||||||
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
|
|
||||||
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
|
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
|
||||||
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
||||||
__u64 start, __u64 len);
|
__u64 start, __u64 len);
|
||||||
@ -3156,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
|
|||||||
struct inode *inode2, ext4_lblk_t lblk1,
|
struct inode *inode2, ext4_lblk_t lblk1,
|
||||||
ext4_lblk_t lblk2, ext4_lblk_t count,
|
ext4_lblk_t lblk2, ext4_lblk_t count,
|
||||||
int mark_unwritten,int *err);
|
int mark_unwritten,int *err);
|
||||||
|
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
|
||||||
|
|
||||||
/* move_extent.c */
|
/* move_extent.c */
|
||||||
extern void ext4_double_down_write_data_sem(struct inode *first,
|
extern void ext4_double_down_write_data_sem(struct inode *first,
|
||||||
|
@ -119,6 +119,19 @@ struct ext4_ext_path {
|
|||||||
struct buffer_head *p_bh;
|
struct buffer_head *p_bh;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Used to record a portion of a cluster found at the beginning or end
|
||||||
|
* of an extent while traversing the extent tree during space removal.
|
||||||
|
* A partial cluster may be removed if it does not contain blocks shared
|
||||||
|
* with extents that aren't being deleted (tofree state). Otherwise,
|
||||||
|
* it cannot be removed (nofree state).
|
||||||
|
*/
|
||||||
|
struct partial_cluster {
|
||||||
|
ext4_fsblk_t pclu; /* physical cluster number */
|
||||||
|
ext4_lblk_t lblk; /* logical block number within logical cluster */
|
||||||
|
enum {initial, tofree, nofree} state;
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* structure for external API
|
* structure for external API
|
||||||
*/
|
*/
|
||||||
|
@ -2351,7 +2351,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
|
|||||||
{
|
{
|
||||||
struct extent_status es;
|
struct extent_status es;
|
||||||
|
|
||||||
ext4_es_find_delayed_extent_range(inode, hole_start,
|
ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
|
||||||
hole_start + hole_len - 1, &es);
|
hole_start + hole_len - 1, &es);
|
||||||
if (es.es_len) {
|
if (es.es_len) {
|
||||||
/* There's delayed extent containing lblock? */
|
/* There's delayed extent containing lblock? */
|
||||||
@ -2490,43 +2490,58 @@ static inline int get_default_free_blocks_flags(struct inode *inode)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_rereserve_cluster - increment the reserved cluster count when
|
||||||
|
* freeing a cluster with a pending reservation
|
||||||
|
*
|
||||||
|
* @inode - file containing the cluster
|
||||||
|
* @lblk - logical block in cluster to be reserved
|
||||||
|
*
|
||||||
|
* Increments the reserved cluster count and adjusts quota in a bigalloc
|
||||||
|
* file system when freeing a partial cluster containing at least one
|
||||||
|
* delayed and unwritten block. A partial cluster meeting that
|
||||||
|
* requirement will have a pending reservation. If so, the
|
||||||
|
* RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
|
||||||
|
* defer reserved and allocated space accounting to a subsequent call
|
||||||
|
* to this function.
|
||||||
|
*/
|
||||||
|
static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||||
|
|
||||||
|
dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
|
||||||
|
|
||||||
|
spin_lock(&ei->i_block_reservation_lock);
|
||||||
|
ei->i_reserved_data_blocks++;
|
||||||
|
percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
|
||||||
|
spin_unlock(&ei->i_block_reservation_lock);
|
||||||
|
|
||||||
|
percpu_counter_add(&sbi->s_freeclusters_counter, 1);
|
||||||
|
ext4_remove_pending(inode, lblk);
|
||||||
|
}
|
||||||
|
|
||||||
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
|
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
|
||||||
struct ext4_extent *ex,
|
struct ext4_extent *ex,
|
||||||
long long *partial_cluster,
|
struct partial_cluster *partial,
|
||||||
ext4_lblk_t from, ext4_lblk_t to)
|
ext4_lblk_t from, ext4_lblk_t to)
|
||||||
{
|
{
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
unsigned short ee_len = ext4_ext_get_actual_len(ex);
|
unsigned short ee_len = ext4_ext_get_actual_len(ex);
|
||||||
ext4_fsblk_t pblk;
|
ext4_fsblk_t last_pblk, pblk;
|
||||||
int flags = get_default_free_blocks_flags(inode);
|
ext4_lblk_t num;
|
||||||
|
int flags;
|
||||||
|
|
||||||
/*
|
/* only extent tail removal is allowed */
|
||||||
* For bigalloc file systems, we never free a partial cluster
|
if (from < le32_to_cpu(ex->ee_block) ||
|
||||||
* at the beginning of the extent. Instead, we make a note
|
to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
|
||||||
* that we tried freeing the cluster, and check to see if we
|
ext4_error(sbi->s_sb,
|
||||||
* need to free it on a subsequent call to ext4_remove_blocks,
|
"strange request: removal(2) %u-%u from %u:%u",
|
||||||
* or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
|
from, to, le32_to_cpu(ex->ee_block), ee_len);
|
||||||
*/
|
return 0;
|
||||||
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
|
|
||||||
|
|
||||||
trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
|
|
||||||
/*
|
|
||||||
* If we have a partial cluster, and it's different from the
|
|
||||||
* cluster of the last block, we need to explicitly free the
|
|
||||||
* partial cluster here.
|
|
||||||
*/
|
|
||||||
pblk = ext4_ext_pblock(ex) + ee_len - 1;
|
|
||||||
if (*partial_cluster > 0 &&
|
|
||||||
*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
|
|
||||||
ext4_free_blocks(handle, inode, NULL,
|
|
||||||
EXT4_C2B(sbi, *partial_cluster),
|
|
||||||
sbi->s_cluster_ratio, flags);
|
|
||||||
*partial_cluster = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef EXTENTS_STATS
|
#ifdef EXTENTS_STATS
|
||||||
{
|
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
||||||
spin_lock(&sbi->s_ext_stats_lock);
|
spin_lock(&sbi->s_ext_stats_lock);
|
||||||
sbi->s_ext_blocks += ee_len;
|
sbi->s_ext_blocks += ee_len;
|
||||||
sbi->s_ext_extents++;
|
sbi->s_ext_extents++;
|
||||||
@ -2537,58 +2552,94 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
|
|||||||
if (ext_depth(inode) > sbi->s_depth_max)
|
if (ext_depth(inode) > sbi->s_depth_max)
|
||||||
sbi->s_depth_max = ext_depth(inode);
|
sbi->s_depth_max = ext_depth(inode);
|
||||||
spin_unlock(&sbi->s_ext_stats_lock);
|
spin_unlock(&sbi->s_ext_stats_lock);
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
if (from >= le32_to_cpu(ex->ee_block)
|
|
||||||
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
|
trace_ext4_remove_blocks(inode, ex, from, to, partial);
|
||||||
/* tail removal */
|
|
||||||
ext4_lblk_t num;
|
/*
|
||||||
long long first_cluster;
|
* if we have a partial cluster, and it's different from the
|
||||||
|
* cluster of the last block in the extent, we free it
|
||||||
|
*/
|
||||||
|
last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
|
||||||
|
|
||||||
|
if (partial->state != initial &&
|
||||||
|
partial->pclu != EXT4_B2C(sbi, last_pblk)) {
|
||||||
|
if (partial->state == tofree) {
|
||||||
|
flags = get_default_free_blocks_flags(inode);
|
||||||
|
if (ext4_is_pending(inode, partial->lblk))
|
||||||
|
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
|
||||||
|
ext4_free_blocks(handle, inode, NULL,
|
||||||
|
EXT4_C2B(sbi, partial->pclu),
|
||||||
|
sbi->s_cluster_ratio, flags);
|
||||||
|
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
|
||||||
|
ext4_rereserve_cluster(inode, partial->lblk);
|
||||||
|
}
|
||||||
|
partial->state = initial;
|
||||||
|
}
|
||||||
|
|
||||||
num = le32_to_cpu(ex->ee_block) + ee_len - from;
|
num = le32_to_cpu(ex->ee_block) + ee_len - from;
|
||||||
pblk = ext4_ext_pblock(ex) + ee_len - num;
|
pblk = ext4_ext_pblock(ex) + ee_len - num;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Usually we want to free partial cluster at the end of the
|
* We free the partial cluster at the end of the extent (if any),
|
||||||
* extent, except for the situation when the cluster is still
|
* unless the cluster is used by another extent (partial_cluster
|
||||||
* used by any other extent (partial_cluster is negative).
|
* state is nofree). If a partial cluster exists here, it must be
|
||||||
|
* shared with the last block in the extent.
|
||||||
*/
|
*/
|
||||||
if (*partial_cluster < 0 &&
|
flags = get_default_free_blocks_flags(inode);
|
||||||
*partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
|
|
||||||
|
/* partial, left end cluster aligned, right end unaligned */
|
||||||
|
if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
|
||||||
|
(EXT4_LBLK_CMASK(sbi, to) >= from) &&
|
||||||
|
(partial->state != nofree)) {
|
||||||
|
if (ext4_is_pending(inode, to))
|
||||||
|
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
|
||||||
|
ext4_free_blocks(handle, inode, NULL,
|
||||||
|
EXT4_PBLK_CMASK(sbi, last_pblk),
|
||||||
|
sbi->s_cluster_ratio, flags);
|
||||||
|
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
|
||||||
|
ext4_rereserve_cluster(inode, to);
|
||||||
|
partial->state = initial;
|
||||||
|
flags = get_default_free_blocks_flags(inode);
|
||||||
|
}
|
||||||
|
|
||||||
flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
|
flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
|
||||||
|
|
||||||
ext_debug("free last %u blocks starting %llu partial %lld\n",
|
|
||||||
num, pblk, *partial_cluster);
|
|
||||||
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
|
|
||||||
/*
|
/*
|
||||||
* If the block range to be freed didn't start at the
|
* For bigalloc file systems, we never free a partial cluster
|
||||||
* beginning of a cluster, and we removed the entire
|
* at the beginning of the extent. Instead, we check to see if we
|
||||||
* extent and the cluster is not used by any other extent,
|
* need to free it on a subsequent call to ext4_remove_blocks,
|
||||||
* save the partial cluster here, since we might need to
|
* or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
|
||||||
* delete if we determine that the truncate or punch hole
|
|
||||||
* operation has removed all of the blocks in the cluster.
|
|
||||||
* If that cluster is used by another extent, preserve its
|
|
||||||
* negative value so it isn't freed later on.
|
|
||||||
*
|
|
||||||
* If the whole extent wasn't freed, we've reached the
|
|
||||||
* start of the truncated/punched region and have finished
|
|
||||||
* removing blocks. If there's a partial cluster here it's
|
|
||||||
* shared with the remainder of the extent and is no longer
|
|
||||||
* a candidate for removal.
|
|
||||||
*/
|
*/
|
||||||
if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
|
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
|
||||||
first_cluster = (long long) EXT4_B2C(sbi, pblk);
|
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
|
||||||
if (first_cluster != -*partial_cluster)
|
|
||||||
*partial_cluster = first_cluster;
|
/* reset the partial cluster if we've freed past it */
|
||||||
} else {
|
if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
|
||||||
*partial_cluster = 0;
|
partial->state = initial;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we've freed the entire extent but the beginning is not left
|
||||||
|
* cluster aligned and is not marked as ineligible for freeing we
|
||||||
|
* record the partial cluster at the beginning of the extent. It
|
||||||
|
* wasn't freed by the preceding ext4_free_blocks() call, and we
|
||||||
|
* need to look farther to the left to determine if it's to be freed
|
||||||
|
* (not shared with another extent). Else, reset the partial
|
||||||
|
* cluster - we're either done freeing or the beginning of the
|
||||||
|
* extent is left cluster aligned.
|
||||||
|
*/
|
||||||
|
if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
|
||||||
|
if (partial->state == initial) {
|
||||||
|
partial->pclu = EXT4_B2C(sbi, pblk);
|
||||||
|
partial->lblk = from;
|
||||||
|
partial->state = tofree;
|
||||||
}
|
}
|
||||||
} else
|
} else {
|
||||||
ext4_error(sbi->s_sb, "strange request: removal(2) "
|
partial->state = initial;
|
||||||
"%u-%u from %u:%u",
|
|
||||||
from, to, le32_to_cpu(ex->ee_block), ee_len);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ext4_ext_rm_leaf() Removes the extents associated with the
|
* ext4_ext_rm_leaf() Removes the extents associated with the
|
||||||
@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
|
|||||||
static int
|
static int
|
||||||
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
||||||
struct ext4_ext_path *path,
|
struct ext4_ext_path *path,
|
||||||
long long *partial_cluster,
|
struct partial_cluster *partial,
|
||||||
ext4_lblk_t start, ext4_lblk_t end)
|
ext4_lblk_t start, ext4_lblk_t end)
|
||||||
{
|
{
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
|||||||
ex_ee_block = le32_to_cpu(ex->ee_block);
|
ex_ee_block = le32_to_cpu(ex->ee_block);
|
||||||
ex_ee_len = ext4_ext_get_actual_len(ex);
|
ex_ee_len = ext4_ext_get_actual_len(ex);
|
||||||
|
|
||||||
trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
|
trace_ext4_ext_rm_leaf(inode, start, ex, partial);
|
||||||
|
|
||||||
while (ex >= EXT_FIRST_EXTENT(eh) &&
|
while (ex >= EXT_FIRST_EXTENT(eh) &&
|
||||||
ex_ee_block + ex_ee_len > start) {
|
ex_ee_block + ex_ee_len > start) {
|
||||||
@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
|||||||
*/
|
*/
|
||||||
if (sbi->s_cluster_ratio > 1) {
|
if (sbi->s_cluster_ratio > 1) {
|
||||||
pblk = ext4_ext_pblock(ex);
|
pblk = ext4_ext_pblock(ex);
|
||||||
*partial_cluster =
|
partial->pclu = EXT4_B2C(sbi, pblk);
|
||||||
-(long long) EXT4_B2C(sbi, pblk);
|
partial->state = nofree;
|
||||||
}
|
}
|
||||||
ex--;
|
ex--;
|
||||||
ex_ee_block = le32_to_cpu(ex->ee_block);
|
ex_ee_block = le32_to_cpu(ex->ee_block);
|
||||||
@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
|||||||
if (err)
|
if (err)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
|
err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
|
||||||
a, b);
|
|
||||||
if (err)
|
if (err)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
|
|||||||
* If there's a partial cluster and at least one extent remains in
|
* If there's a partial cluster and at least one extent remains in
|
||||||
* the leaf, free the partial cluster if it isn't shared with the
|
* the leaf, free the partial cluster if it isn't shared with the
|
||||||
* current extent. If it is shared with the current extent
|
* current extent. If it is shared with the current extent
|
||||||
* we zero partial_cluster because we've reached the start of the
|
* we reset the partial cluster because we've reached the start of the
|
||||||
* truncated/punched region and we're done removing blocks.
|
* truncated/punched region and we're done removing blocks.
|
||||||
*/
|
*/
|
||||||
if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
|
if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
|
||||||
pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
|
pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
|
||||||
if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
|
if (partial->pclu != EXT4_B2C(sbi, pblk)) {
|
||||||
|
int flags = get_default_free_blocks_flags(inode);
|
||||||
|
|
||||||
|
if (ext4_is_pending(inode, partial->lblk))
|
||||||
|
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
|
||||||
ext4_free_blocks(handle, inode, NULL,
|
ext4_free_blocks(handle, inode, NULL,
|
||||||
EXT4_C2B(sbi, *partial_cluster),
|
EXT4_C2B(sbi, partial->pclu),
|
||||||
sbi->s_cluster_ratio,
|
sbi->s_cluster_ratio, flags);
|
||||||
get_default_free_blocks_flags(inode));
|
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
|
||||||
|
ext4_rereserve_cluster(inode, partial->lblk);
|
||||||
}
|
}
|
||||||
*partial_cluster = 0;
|
partial->state = initial;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if this leaf is free, then we should
|
/* if this leaf is free, then we should
|
||||||
@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
|
|||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
int depth = ext_depth(inode);
|
int depth = ext_depth(inode);
|
||||||
struct ext4_ext_path *path = NULL;
|
struct ext4_ext_path *path = NULL;
|
||||||
long long partial_cluster = 0;
|
struct partial_cluster partial;
|
||||||
handle_t *handle;
|
handle_t *handle;
|
||||||
int i = 0, err = 0;
|
int i = 0, err = 0;
|
||||||
|
|
||||||
|
partial.pclu = 0;
|
||||||
|
partial.lblk = 0;
|
||||||
|
partial.state = initial;
|
||||||
|
|
||||||
ext_debug("truncate since %u to %u\n", start, end);
|
ext_debug("truncate since %u to %u\n", start, end);
|
||||||
|
|
||||||
/* probably first extent we're gonna free will be last in block */
|
/* probably first extent we're gonna free will be last in block */
|
||||||
@ -2882,8 +2941,8 @@ again:
|
|||||||
*/
|
*/
|
||||||
if (sbi->s_cluster_ratio > 1) {
|
if (sbi->s_cluster_ratio > 1) {
|
||||||
pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
|
pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
|
||||||
partial_cluster =
|
partial.pclu = EXT4_B2C(sbi, pblk);
|
||||||
-(long long) EXT4_B2C(sbi, pblk);
|
partial.state = nofree;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2911,9 +2970,10 @@ again:
|
|||||||
&ex);
|
&ex);
|
||||||
if (err)
|
if (err)
|
||||||
goto out;
|
goto out;
|
||||||
if (pblk)
|
if (pblk) {
|
||||||
partial_cluster =
|
partial.pclu = EXT4_B2C(sbi, pblk);
|
||||||
-(long long) EXT4_B2C(sbi, pblk);
|
partial.state = nofree;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
@ -2948,8 +3008,7 @@ again:
|
|||||||
if (i == depth) {
|
if (i == depth) {
|
||||||
/* this is leaf block */
|
/* this is leaf block */
|
||||||
err = ext4_ext_rm_leaf(handle, inode, path,
|
err = ext4_ext_rm_leaf(handle, inode, path,
|
||||||
&partial_cluster, start,
|
&partial, start, end);
|
||||||
end);
|
|
||||||
/* root level has p_bh == NULL, brelse() eats this */
|
/* root level has p_bh == NULL, brelse() eats this */
|
||||||
brelse(path[i].p_bh);
|
brelse(path[i].p_bh);
|
||||||
path[i].p_bh = NULL;
|
path[i].p_bh = NULL;
|
||||||
@ -3021,21 +3080,24 @@ again:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
trace_ext4_ext_remove_space_done(inode, start, end, depth,
|
trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
|
||||||
partial_cluster, path->p_hdr->eh_entries);
|
path->p_hdr->eh_entries);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we still have something in the partial cluster and we have removed
|
* if there's a partial cluster and we have removed the first extent
|
||||||
* even the first extent, then we should free the blocks in the partial
|
* in the file, then we also free the partial cluster, if any
|
||||||
* cluster as well. (This code will only run when there are no leaves
|
|
||||||
* to the immediate left of the truncated/punched region.)
|
|
||||||
*/
|
*/
|
||||||
if (partial_cluster > 0 && err == 0) {
|
if (partial.state == tofree && err == 0) {
|
||||||
/* don't zero partial_cluster since it's not used afterwards */
|
int flags = get_default_free_blocks_flags(inode);
|
||||||
|
|
||||||
|
if (ext4_is_pending(inode, partial.lblk))
|
||||||
|
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
|
||||||
ext4_free_blocks(handle, inode, NULL,
|
ext4_free_blocks(handle, inode, NULL,
|
||||||
EXT4_C2B(sbi, partial_cluster),
|
EXT4_C2B(sbi, partial.pclu),
|
||||||
sbi->s_cluster_ratio,
|
sbi->s_cluster_ratio, flags);
|
||||||
get_default_free_blocks_flags(inode));
|
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
|
||||||
|
ext4_rereserve_cluster(inode, partial.lblk);
|
||||||
|
partial.state = initial;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TODO: flexible tree reduction should be here */
|
/* TODO: flexible tree reduction should be here */
|
||||||
@ -3819,114 +3881,6 @@ out:
|
|||||||
return ext4_mark_inode_dirty(handle, inode);
|
return ext4_mark_inode_dirty(handle, inode);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* ext4_find_delalloc_range: find delayed allocated block in the given range.
|
|
||||||
*
|
|
||||||
* Return 1 if there is a delalloc block in the range, otherwise 0.
|
|
||||||
*/
|
|
||||||
int ext4_find_delalloc_range(struct inode *inode,
|
|
||||||
ext4_lblk_t lblk_start,
|
|
||||||
ext4_lblk_t lblk_end)
|
|
||||||
{
|
|
||||||
struct extent_status es;
|
|
||||||
|
|
||||||
ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
|
|
||||||
if (es.es_len == 0)
|
|
||||||
return 0; /* there is no delay extent in this tree */
|
|
||||||
else if (es.es_lblk <= lblk_start &&
|
|
||||||
lblk_start < es.es_lblk + es.es_len)
|
|
||||||
return 1;
|
|
||||||
else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
|
|
||||||
return 1;
|
|
||||||
else
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Check the whole logical cluster containing @lblk for a delayed allocated
 * block.  Returns whatever ext4_find_delalloc_range() reports for the
 * cluster-aligned block range.
 */
int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t clu_first = EXT4_LBLK_CMASK(sbi, lblk);
	ext4_lblk_t clu_last = clu_first + sbi->s_cluster_ratio - 1;

	return ext4_find_delalloc_range(inode, clu_first, clu_last);
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines how many complete clusters (out of those specified by the 'map')
|
|
||||||
* are under delalloc and were reserved quota for.
|
|
||||||
* This function is called when we are writing out the blocks that were
|
|
||||||
* originally written with their allocation delayed, but then the space was
|
|
||||||
* allocated using fallocate() before the delayed allocation could be resolved.
|
|
||||||
* The cases to look for are:
|
|
||||||
* ('=' indicated delayed allocated blocks
|
|
||||||
* '-' indicates non-delayed allocated blocks)
|
|
||||||
* (a) partial clusters towards beginning and/or end outside of allocated range
|
|
||||||
* are not delalloc'ed.
|
|
||||||
* Ex:
|
|
||||||
* |----c---=|====c====|====c====|===-c----|
|
|
||||||
* |++++++ allocated ++++++|
|
|
||||||
* ==> 4 complete clusters in above example
|
|
||||||
*
|
|
||||||
* (b) partial cluster (outside of allocated range) towards either end is
|
|
||||||
* marked for delayed allocation. In this case, we will exclude that
|
|
||||||
* cluster.
|
|
||||||
* Ex:
|
|
||||||
* |----====c========|========c========|
|
|
||||||
* |++++++ allocated ++++++|
|
|
||||||
* ==> 1 complete clusters in above example
|
|
||||||
*
|
|
||||||
* Ex:
|
|
||||||
* |================c================|
|
|
||||||
* |++++++ allocated ++++++|
|
|
||||||
* ==> 0 complete clusters in above example
|
|
||||||
*
|
|
||||||
* The ext4_da_update_reserve_space will be called only if we
|
|
||||||
* determine here that there were some "entire" clusters that span
|
|
||||||
* this 'allocated' range.
|
|
||||||
* In the non-bigalloc case, this function will just end up returning num_blks
|
|
||||||
* without ever calling ext4_find_delalloc_range.
|
|
||||||
*/
|
|
||||||
static unsigned int
|
|
||||||
get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
|
|
||||||
unsigned int num_blks)
|
|
||||||
{
|
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
||||||
ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
|
|
||||||
ext4_lblk_t lblk_from, lblk_to, c_offset;
|
|
||||||
unsigned int allocated_clusters = 0;
|
|
||||||
|
|
||||||
alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
|
|
||||||
alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
|
|
||||||
|
|
||||||
/* max possible clusters for this allocation */
|
|
||||||
allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
|
|
||||||
|
|
||||||
trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
|
|
||||||
|
|
||||||
/* Check towards left side */
|
|
||||||
c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
|
|
||||||
if (c_offset) {
|
|
||||||
lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
|
|
||||||
lblk_to = lblk_from + c_offset - 1;
|
|
||||||
|
|
||||||
if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
|
|
||||||
allocated_clusters--;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Now check towards right. */
|
|
||||||
c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
|
|
||||||
if (allocated_clusters && c_offset) {
|
|
||||||
lblk_from = lblk_start + num_blks;
|
|
||||||
lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
|
|
||||||
|
|
||||||
if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
|
|
||||||
allocated_clusters--;
|
|
||||||
}
|
|
||||||
|
|
||||||
return allocated_clusters;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
convert_initialized_extent(handle_t *handle, struct inode *inode,
|
convert_initialized_extent(handle_t *handle, struct inode *inode,
|
||||||
struct ext4_map_blocks *map,
|
struct ext4_map_blocks *map,
|
||||||
@ -4108,23 +4062,6 @@ out:
|
|||||||
}
|
}
|
||||||
map->m_len = allocated;
|
map->m_len = allocated;
|
||||||
|
|
||||||
/*
|
|
||||||
* If we have done fallocate with the offset that is already
|
|
||||||
* delayed allocated, we would have block reservation
|
|
||||||
* and quota reservation done in the delayed write path.
|
|
||||||
* But fallocate would have already updated quota and block
|
|
||||||
* count for this offset. So cancel these reservation
|
|
||||||
*/
|
|
||||||
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
|
|
||||||
unsigned int reserved_clusters;
|
|
||||||
reserved_clusters = get_reserved_cluster_alloc(inode,
|
|
||||||
map->m_lblk, map->m_len);
|
|
||||||
if (reserved_clusters)
|
|
||||||
ext4_da_update_reserve_space(inode,
|
|
||||||
reserved_clusters,
|
|
||||||
0);
|
|
||||||
}
|
|
||||||
|
|
||||||
map_out:
|
map_out:
|
||||||
map->m_flags |= EXT4_MAP_MAPPED;
|
map->m_flags |= EXT4_MAP_MAPPED;
|
||||||
if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
|
if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
|
||||||
@ -4513,77 +4450,39 @@ got_allocated_blocks:
|
|||||||
map->m_flags |= EXT4_MAP_NEW;
|
map->m_flags |= EXT4_MAP_NEW;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update reserved blocks/metadata blocks after successful
|
* Reduce the reserved cluster count to reflect successful deferred
|
||||||
* block allocation which had been deferred till now.
|
* allocation of delayed allocated clusters or direct allocation of
|
||||||
|
* clusters discovered to be delayed allocated. Once allocated, a
|
||||||
|
* cluster is not included in the reserved count.
|
||||||
*/
|
*/
|
||||||
|
if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
|
||||||
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
|
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
|
||||||
unsigned int reserved_clusters;
|
|
||||||
/*
|
/*
|
||||||
* Check how many clusters we had reserved this allocated range
|
* When allocating delayed allocated clusters, simply
|
||||||
*/
|
* reduce the reserved cluster count and claim quota
|
||||||
reserved_clusters = get_reserved_cluster_alloc(inode,
|
|
||||||
map->m_lblk, allocated);
|
|
||||||
if (!map_from_cluster) {
|
|
||||||
BUG_ON(allocated_clusters < reserved_clusters);
|
|
||||||
if (reserved_clusters < allocated_clusters) {
|
|
||||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
|
||||||
int reservation = allocated_clusters -
|
|
||||||
reserved_clusters;
|
|
||||||
/*
|
|
||||||
* It seems we claimed few clusters outside of
|
|
||||||
* the range of this allocation. We should give
|
|
||||||
* it back to the reservation pool. This can
|
|
||||||
* happen in the following case:
|
|
||||||
*
|
|
||||||
* * Suppose s_cluster_ratio is 4 (i.e., each
|
|
||||||
* cluster has 4 blocks. Thus, the clusters
|
|
||||||
* are [0-3],[4-7],[8-11]...
|
|
||||||
* * First comes delayed allocation write for
|
|
||||||
* logical blocks 10 & 11. Since there were no
|
|
||||||
* previous delayed allocated blocks in the
|
|
||||||
* range [8-11], we would reserve 1 cluster
|
|
||||||
* for this write.
|
|
||||||
* * Next comes write for logical blocks 3 to 8.
|
|
||||||
* In this case, we will reserve 2 clusters
|
|
||||||
* (for [0-3] and [4-7]; and not for [8-11] as
|
|
||||||
* that range has a delayed allocated blocks.
|
|
||||||
* Thus total reserved clusters now becomes 3.
|
|
||||||
* * Now, during the delayed allocation writeout
|
|
||||||
* time, we will first write blocks [3-8] and
|
|
||||||
* allocate 3 clusters for writing these
|
|
||||||
* blocks. Also, we would claim all these
|
|
||||||
* three clusters above.
|
|
||||||
* * Now when we come here to writeout the
|
|
||||||
* blocks [10-11], we would expect to claim
|
|
||||||
* the reservation of 1 cluster we had made
|
|
||||||
* (and we would claim it since there are no
|
|
||||||
* more delayed allocated blocks in the range
|
|
||||||
* [8-11]. But our reserved cluster count had
|
|
||||||
* already gone to 0.
|
|
||||||
*
|
|
||||||
* Thus, at the step 4 above when we determine
|
|
||||||
* that there are still some unwritten delayed
|
|
||||||
* allocated blocks outside of our current
|
|
||||||
* block range, we should increment the
|
|
||||||
* reserved clusters count so that when the
|
|
||||||
* remaining blocks finally gets written, we
|
|
||||||
* could claim them.
|
|
||||||
*/
|
|
||||||
dquot_reserve_block(inode,
|
|
||||||
EXT4_C2B(sbi, reservation));
|
|
||||||
spin_lock(&ei->i_block_reservation_lock);
|
|
||||||
ei->i_reserved_data_blocks += reservation;
|
|
||||||
spin_unlock(&ei->i_block_reservation_lock);
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* We will claim quota for all newly allocated blocks.
|
|
||||||
* We're updating the reserved space *after* the
|
|
||||||
* correction above so we do not accidentally free
|
|
||||||
* all the metadata reservation because we might
|
|
||||||
* actually need it later on.
|
|
||||||
*/
|
*/
|
||||||
ext4_da_update_reserve_space(inode, allocated_clusters,
|
ext4_da_update_reserve_space(inode, allocated_clusters,
|
||||||
1);
|
1);
|
||||||
|
} else {
|
||||||
|
ext4_lblk_t lblk, len;
|
||||||
|
unsigned int n;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When allocating non-delayed allocated clusters
|
||||||
|
* (from fallocate, filemap, DIO, or clusters
|
||||||
|
* allocated when delalloc has been disabled by
|
||||||
|
* ext4_nonda_switch), reduce the reserved cluster
|
||||||
|
* count by the number of allocated clusters that
|
||||||
|
* have previously been delayed allocated. Quota
|
||||||
|
* has been claimed by ext4_mb_new_blocks() above,
|
||||||
|
* so release the quota reservations made for any
|
||||||
|
* previously delayed allocated clusters.
|
||||||
|
*/
|
||||||
|
lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
|
||||||
|
len = allocated_clusters << sbi->s_cluster_bits;
|
||||||
|
n = ext4_es_delayed_clu(inode, lblk, len);
|
||||||
|
if (n > 0)
|
||||||
|
ext4_da_update_reserve_space(inode, (int) n, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
|
|||||||
ext4_lblk_t block, next_del;
|
ext4_lblk_t block, next_del;
|
||||||
|
|
||||||
if (newes->es_pblk == 0) {
|
if (newes->es_pblk == 0) {
|
||||||
ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
|
ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
|
||||||
newes->es_lblk + newes->es_len - 1, &es);
|
newes->es_lblk,
|
||||||
|
newes->es_lblk + newes->es_len - 1,
|
||||||
|
&es);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* No extent in extent-tree contains block @newes->es_pblk,
|
* No extent in extent-tree contains block @newes->es_pblk,
|
||||||
@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode,
|
|||||||
}
|
}
|
||||||
|
|
||||||
block = newes->es_lblk + newes->es_len;
|
block = newes->es_lblk + newes->es_len;
|
||||||
ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
|
ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
|
||||||
|
EXT_MAX_BLOCKS, &es);
|
||||||
if (es.es_len == 0)
|
if (es.es_len == 0)
|
||||||
next_del = EXT_MAX_BLOCKS;
|
next_del = EXT_MAX_BLOCKS;
|
||||||
else
|
else
|
||||||
@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
|
|||||||
}
|
}
|
||||||
return replaced_count;
|
return replaced_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_clu_mapped - determine whether any block in a logical cluster has
|
||||||
|
* been mapped to a physical cluster
|
||||||
|
*
|
||||||
|
* @inode - file containing the logical cluster
|
||||||
|
* @lclu - logical cluster of interest
|
||||||
|
*
|
||||||
|
* Returns 1 if any block in the logical cluster is mapped, signifying
|
||||||
|
* that a physical cluster has been allocated for it. Otherwise,
|
||||||
|
* returns 0. Can also return negative error codes. Derived from
|
||||||
|
* ext4_ext_map_blocks().
|
||||||
|
*/
|
||||||
|
int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
struct ext4_ext_path *path;
|
||||||
|
int depth, mapped = 0, err = 0;
|
||||||
|
struct ext4_extent *extent;
|
||||||
|
ext4_lblk_t first_lblk, first_lclu, last_lclu;
|
||||||
|
|
||||||
|
/* search for the extent closest to the first block in the cluster */
|
||||||
|
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
|
||||||
|
if (IS_ERR(path)) {
|
||||||
|
err = PTR_ERR(path);
|
||||||
|
path = NULL;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
depth = ext_depth(inode);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A consistent leaf must not be empty. This situation is possible,
|
||||||
|
* though, _during_ tree modification, and it's why an assert can't
|
||||||
|
* be put in ext4_find_extent().
|
||||||
|
*/
|
||||||
|
if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
|
||||||
|
EXT4_ERROR_INODE(inode,
|
||||||
|
"bad extent address - lblock: %lu, depth: %d, pblock: %lld",
|
||||||
|
(unsigned long) EXT4_C2B(sbi, lclu),
|
||||||
|
depth, path[depth].p_block);
|
||||||
|
err = -EFSCORRUPTED;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
extent = path[depth].p_ext;
|
||||||
|
|
||||||
|
/* can't be mapped if the extent tree is empty */
|
||||||
|
if (extent == NULL)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
first_lblk = le32_to_cpu(extent->ee_block);
|
||||||
|
first_lclu = EXT4_B2C(sbi, first_lblk);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Three possible outcomes at this point - found extent spanning
|
||||||
|
* the target cluster, to the left of the target cluster, or to the
|
||||||
|
* right of the target cluster. The first two cases are handled here.
|
||||||
|
* The last case indicates the target cluster is not mapped.
|
||||||
|
*/
|
||||||
|
if (lclu >= first_lclu) {
|
||||||
|
last_lclu = EXT4_B2C(sbi, first_lblk +
|
||||||
|
ext4_ext_get_actual_len(extent) - 1);
|
||||||
|
if (lclu <= last_lclu) {
|
||||||
|
mapped = 1;
|
||||||
|
} else {
|
||||||
|
first_lblk = ext4_ext_next_allocated_block(path);
|
||||||
|
first_lclu = EXT4_B2C(sbi, first_lblk);
|
||||||
|
if (lclu == first_lclu)
|
||||||
|
mapped = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
ext4_ext_drop_refs(path);
|
||||||
|
kfree(path);
|
||||||
|
|
||||||
|
return err ? err : mapped;
|
||||||
|
}
|
||||||
|
@ -142,6 +142,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
static struct kmem_cache *ext4_es_cachep;
|
static struct kmem_cache *ext4_es_cachep;
|
||||||
|
static struct kmem_cache *ext4_pending_cachep;
|
||||||
|
|
||||||
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
|
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
|
||||||
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
|
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
|
||||||
@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
|
|||||||
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
|
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
|
||||||
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
|
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
|
||||||
struct ext4_inode_info *locked_ei);
|
struct ext4_inode_info *locked_ei);
|
||||||
|
static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
ext4_lblk_t len);
|
||||||
|
|
||||||
int __init ext4_init_es(void)
|
int __init ext4_init_es(void)
|
||||||
{
|
{
|
||||||
@ -233,15 +236,25 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
|
* __es_find_extent_range - find extent with specified status within block
|
||||||
* @es->lblk if it exists, otherwise, the next extent after @es->lblk.
|
* range or next extent following block range in
|
||||||
|
* extents status tree
|
||||||
*
|
*
|
||||||
* @inode: the inode which owns delayed extents
|
* @inode - file containing the range
|
||||||
* @lblk: the offset where we start to search
|
* @matching_fn - pointer to function that matches extents with desired status
|
||||||
* @end: the offset where we stop to search
|
* @lblk - logical block defining start of range
|
||||||
* @es: delayed extent that we found
|
* @end - logical block defining end of range
|
||||||
|
* @es - extent found, if any
|
||||||
|
*
|
||||||
|
* Find the first extent within the block range specified by @lblk and @end
|
||||||
|
* in the extents status tree that satisfies @matching_fn. If a match
|
||||||
|
* is found, it's returned in @es. If not, and a matching extent is found
|
||||||
|
* beyond the block range, it's returned in @es. If no match is found, an
|
||||||
|
* extent is returned in @es whose es_lblk, es_len, and es_pblk components
|
||||||
|
* are 0.
|
||||||
*/
|
*/
|
||||||
void ext4_es_find_delayed_extent_range(struct inode *inode,
|
static void __es_find_extent_range(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
ext4_lblk_t lblk, ext4_lblk_t end,
|
ext4_lblk_t lblk, ext4_lblk_t end,
|
||||||
struct extent_status *es)
|
struct extent_status *es)
|
||||||
{
|
{
|
||||||
@ -249,14 +262,12 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
|
|||||||
struct extent_status *es1 = NULL;
|
struct extent_status *es1 = NULL;
|
||||||
struct rb_node *node;
|
struct rb_node *node;
|
||||||
|
|
||||||
BUG_ON(es == NULL);
|
WARN_ON(es == NULL);
|
||||||
BUG_ON(end < lblk);
|
WARN_ON(end < lblk);
|
||||||
trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
|
|
||||||
|
|
||||||
read_lock(&EXT4_I(inode)->i_es_lock);
|
|
||||||
tree = &EXT4_I(inode)->i_es_tree;
|
tree = &EXT4_I(inode)->i_es_tree;
|
||||||
|
|
||||||
/* find extent in cache firstly */
|
/* see if the extent has been cached */
|
||||||
es->es_lblk = es->es_len = es->es_pblk = 0;
|
es->es_lblk = es->es_len = es->es_pblk = 0;
|
||||||
if (tree->cache_es) {
|
if (tree->cache_es) {
|
||||||
es1 = tree->cache_es;
|
es1 = tree->cache_es;
|
||||||
@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
|
|||||||
es1 = __es_tree_search(&tree->root, lblk);
|
es1 = __es_tree_search(&tree->root, lblk);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
if (es1 && !ext4_es_is_delayed(es1)) {
|
if (es1 && !matching_fn(es1)) {
|
||||||
while ((node = rb_next(&es1->rb_node)) != NULL) {
|
while ((node = rb_next(&es1->rb_node)) != NULL) {
|
||||||
es1 = rb_entry(node, struct extent_status, rb_node);
|
es1 = rb_entry(node, struct extent_status, rb_node);
|
||||||
if (es1->es_lblk > end) {
|
if (es1->es_lblk > end) {
|
||||||
es1 = NULL;
|
es1 = NULL;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (ext4_es_is_delayed(es1))
|
if (matching_fn(es1))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (es1 && ext4_es_is_delayed(es1)) {
|
if (es1 && matching_fn(es1)) {
|
||||||
tree->cache_es = es1;
|
tree->cache_es = es1;
|
||||||
es->es_lblk = es1->es_lblk;
|
es->es_lblk = es1->es_lblk;
|
||||||
es->es_len = es1->es_len;
|
es->es_len = es1->es_len;
|
||||||
es->es_pblk = es1->es_pblk;
|
es->es_pblk = es1->es_pblk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Locking for __es_find_extent_range() for external use
|
||||||
|
*/
|
||||||
|
void ext4_es_find_extent_range(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t lblk, ext4_lblk_t end,
|
||||||
|
struct extent_status *es)
|
||||||
|
{
|
||||||
|
trace_ext4_es_find_extent_range_enter(inode, lblk);
|
||||||
|
|
||||||
|
read_lock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
__es_find_extent_range(inode, matching_fn, lblk, end, es);
|
||||||
read_unlock(&EXT4_I(inode)->i_es_lock);
|
read_unlock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
|
trace_ext4_es_find_extent_range_exit(inode, es);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __es_scan_range - search block range for block with specified status
|
||||||
|
* in extents status tree
|
||||||
|
*
|
||||||
|
* @inode - file containing the range
|
||||||
|
* @matching_fn - pointer to function that matches extents with desired status
|
||||||
|
* @lblk - logical block defining start of range
|
||||||
|
* @end - logical block defining end of range
|
||||||
|
*
|
||||||
|
* Returns true if at least one block in the specified block range satisfies
|
||||||
|
* the criterion specified by @matching_fn, and false if not. If at least
|
||||||
|
* one extent has the specified status, then there is at least one block
|
||||||
|
* in the cluster with that status. Should only be called by code that has
|
||||||
|
* taken i_es_lock.
|
||||||
|
*/
|
||||||
|
static bool __es_scan_range(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t start, ext4_lblk_t end)
|
||||||
|
{
|
||||||
|
struct extent_status es;
|
||||||
|
|
||||||
|
__es_find_extent_range(inode, matching_fn, start, end, &es);
|
||||||
|
if (es.es_len == 0)
|
||||||
|
return false; /* no matching extent in the tree */
|
||||||
|
else if (es.es_lblk <= start &&
|
||||||
|
start < es.es_lblk + es.es_len)
|
||||||
|
return true;
|
||||||
|
else if (start <= es.es_lblk && es.es_lblk <= end)
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Locking for __es_scan_range() for external use
|
||||||
|
*/
|
||||||
|
bool ext4_es_scan_range(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t lblk, ext4_lblk_t end)
|
||||||
|
{
|
||||||
|
bool ret;
|
||||||
|
|
||||||
|
read_lock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
ret = __es_scan_range(inode, matching_fn, lblk, end);
|
||||||
|
read_unlock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __es_scan_clu - search cluster for block with specified status in
|
||||||
|
* extents status tree
|
||||||
|
*
|
||||||
|
* @inode - file containing the cluster
|
||||||
|
* @matching_fn - pointer to function that matches extents with desired status
|
||||||
|
* @lblk - logical block in cluster to be searched
|
||||||
|
*
|
||||||
|
* Returns true if at least one extent in the cluster containing @lblk
|
||||||
|
* satisfies the criterion specified by @matching_fn, and false if not. If at
|
||||||
|
* least one extent has the specified status, then there is at least one block
|
||||||
|
* in the cluster with that status. Should only be called by code that has
|
||||||
|
* taken i_es_lock.
|
||||||
|
*/
|
||||||
|
static bool __es_scan_clu(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
ext4_lblk_t lblk_start, lblk_end;
|
||||||
|
|
||||||
|
lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
|
||||||
|
lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
|
||||||
|
|
||||||
|
return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Locking for __es_scan_clu() for external use
|
||||||
|
*/
|
||||||
|
bool ext4_es_scan_clu(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
bool ret;
|
||||||
|
|
||||||
|
read_lock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
ret = __es_scan_clu(inode, matching_fn, lblk);
|
||||||
|
read_unlock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ext4_es_list_add(struct inode *inode)
|
static void ext4_es_list_add(struct inode *inode)
|
||||||
@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
|
|||||||
struct extent_status newes;
|
struct extent_status newes;
|
||||||
ext4_lblk_t end = lblk + len - 1;
|
ext4_lblk_t end = lblk + len - 1;
|
||||||
int err = 0;
|
int err = 0;
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
|
||||||
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
|
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
|
||||||
lblk, len, pblk, status, inode->i_ino);
|
lblk, len, pblk, status, inode->i_ino);
|
||||||
@ -730,6 +847,11 @@ retry:
|
|||||||
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
|
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
|
||||||
err = 0;
|
err = 0;
|
||||||
|
|
||||||
|
if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
|
||||||
|
(status & EXTENT_STATUS_WRITTEN ||
|
||||||
|
status & EXTENT_STATUS_UNWRITTEN))
|
||||||
|
__revise_pending(inode, lblk, len);
|
||||||
|
|
||||||
error:
|
error:
|
||||||
write_unlock(&EXT4_I(inode)->i_es_lock);
|
write_unlock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
|
|||||||
ei->i_es_tree.cache_es = NULL;
|
ei->i_es_tree.cache_es = NULL;
|
||||||
return nr_shrunk;
|
return nr_shrunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ES_DEBUG__
/*
 * ext4_print_pending_tree - debug dump of an inode's pending reservations
 *
 * @inode - file whose pending reservation tree is printed
 *
 * Walks the pending reservation rb-tree in order and prints the logical
 * cluster number of every entry on a single log line.  Only built when
 * ES_DEBUG__ is defined; otherwise the call expands to nothing.
 */
static void ext4_print_pending_tree(struct inode *inode)
{
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct pending_reservation *pr;
	struct rb_node *node;

	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
	for (node = rb_first(&tree->root); node; node = rb_next(node)) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		/*
		 * KERN_CONT appends to the header line; a fresh KERN_DEBUG
		 * here would start a new log record for every entry.
		 */
		printk(KERN_CONT " %u", pr->lclu);
	}
	printk(KERN_CONT "\n");
}
#else
#define ext4_print_pending_tree(inode)
#endif
|
||||||
|
|
||||||
|
/*
 * ext4_init_pending - create the slab cache for pending reservations
 *
 * Returns 0 on success, or -ENOMEM if the cache could not be created.
 */
int __init ext4_init_pending(void)
{
	ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
					   sizeof(struct pending_reservation),
					   0, (SLAB_RECLAIM_ACCOUNT), NULL);

	return ext4_pending_cachep ? 0 : -ENOMEM;
}
|
||||||
|
|
||||||
|
void ext4_exit_pending(void)
|
||||||
|
{
|
||||||
|
kmem_cache_destroy(ext4_pending_cachep);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ext4_init_pending_tree(struct ext4_pending_tree *tree)
|
||||||
|
{
|
||||||
|
tree->root = RB_ROOT;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __get_pending - retrieve a pointer to a pending reservation
|
||||||
|
*
|
||||||
|
* @inode - file containing the pending cluster reservation
|
||||||
|
* @lclu - logical cluster of interest
|
||||||
|
*
|
||||||
|
* Returns a pointer to a pending reservation if it's a member of
|
||||||
|
* the set, and NULL if not. Must be called holding i_es_lock.
|
||||||
|
*/
|
||||||
|
static struct pending_reservation *__get_pending(struct inode *inode,
|
||||||
|
ext4_lblk_t lclu)
|
||||||
|
{
|
||||||
|
struct ext4_pending_tree *tree;
|
||||||
|
struct rb_node *node;
|
||||||
|
struct pending_reservation *pr = NULL;
|
||||||
|
|
||||||
|
tree = &EXT4_I(inode)->i_pending_tree;
|
||||||
|
node = (&tree->root)->rb_node;
|
||||||
|
|
||||||
|
while (node) {
|
||||||
|
pr = rb_entry(node, struct pending_reservation, rb_node);
|
||||||
|
if (lclu < pr->lclu)
|
||||||
|
node = node->rb_left;
|
||||||
|
else if (lclu > pr->lclu)
|
||||||
|
node = node->rb_right;
|
||||||
|
else if (lclu == pr->lclu)
|
||||||
|
return pr;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __insert_pending - adds a pending cluster reservation to the set of
|
||||||
|
* pending reservations
|
||||||
|
*
|
||||||
|
* @inode - file containing the cluster
|
||||||
|
* @lblk - logical block in the cluster to be added
|
||||||
|
*
|
||||||
|
* Returns 0 on successful insertion and -ENOMEM on failure. If the
|
||||||
|
* pending reservation is already in the set, returns successfully.
|
||||||
|
*/
|
||||||
|
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
|
||||||
|
struct rb_node **p = &tree->root.rb_node;
|
||||||
|
struct rb_node *parent = NULL;
|
||||||
|
struct pending_reservation *pr;
|
||||||
|
ext4_lblk_t lclu;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
lclu = EXT4_B2C(sbi, lblk);
|
||||||
|
/* search to find parent for insertion */
|
||||||
|
while (*p) {
|
||||||
|
parent = *p;
|
||||||
|
pr = rb_entry(parent, struct pending_reservation, rb_node);
|
||||||
|
|
||||||
|
if (lclu < pr->lclu) {
|
||||||
|
p = &(*p)->rb_left;
|
||||||
|
} else if (lclu > pr->lclu) {
|
||||||
|
p = &(*p)->rb_right;
|
||||||
|
} else {
|
||||||
|
/* pending reservation already inserted */
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
|
||||||
|
if (pr == NULL) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
pr->lclu = lclu;
|
||||||
|
|
||||||
|
rb_link_node(&pr->rb_node, parent, p);
|
||||||
|
rb_insert_color(&pr->rb_node, &tree->root);
|
||||||
|
|
||||||
|
out:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __remove_pending - removes a pending cluster reservation from the set
|
||||||
|
* of pending reservations
|
||||||
|
*
|
||||||
|
* @inode - file containing the cluster
|
||||||
|
* @lblk - logical block in the pending cluster reservation to be removed
|
||||||
|
*
|
||||||
|
* Returns successfully if pending reservation is not a member of the set.
|
||||||
|
*/
|
||||||
|
static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
struct pending_reservation *pr;
|
||||||
|
struct ext4_pending_tree *tree;
|
||||||
|
|
||||||
|
pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
|
||||||
|
if (pr != NULL) {
|
||||||
|
tree = &EXT4_I(inode)->i_pending_tree;
|
||||||
|
rb_erase(&pr->rb_node, &tree->root);
|
||||||
|
kmem_cache_free(ext4_pending_cachep, pr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_remove_pending - removes a pending cluster reservation from the set
|
||||||
|
* of pending reservations
|
||||||
|
*
|
||||||
|
* @inode - file containing the cluster
|
||||||
|
* @lblk - logical block in the pending cluster reservation to be removed
|
||||||
|
*
|
||||||
|
* Locking for external use of __remove_pending.
|
||||||
|
*/
|
||||||
|
void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||||
|
|
||||||
|
write_lock(&ei->i_es_lock);
|
||||||
|
__remove_pending(inode, lblk);
|
||||||
|
write_unlock(&ei->i_es_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_is_pending - determine whether a cluster has a pending reservation
|
||||||
|
* on it
|
||||||
|
*
|
||||||
|
* @inode - file containing the cluster
|
||||||
|
* @lblk - logical block in the cluster
|
||||||
|
*
|
||||||
|
* Returns true if there's a pending reservation for the cluster in the
|
||||||
|
* set of pending reservations, and false if not.
|
||||||
|
*/
|
||||||
|
bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||||
|
bool ret;
|
||||||
|
|
||||||
|
read_lock(&ei->i_es_lock);
|
||||||
|
ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
|
||||||
|
read_unlock(&ei->i_es_lock);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_es_insert_delayed_block - adds a delayed block to the extents status
|
||||||
|
* tree, adding a pending reservation where
|
||||||
|
* needed
|
||||||
|
*
|
||||||
|
* @inode - file containing the newly added block
|
||||||
|
* @lblk - logical block to be added
|
||||||
|
* @allocated - indicates whether a physical cluster has been allocated for
|
||||||
|
* the logical cluster that contains the block
|
||||||
|
*
|
||||||
|
* Returns 0 on success, negative error code on failure.
|
||||||
|
*/
|
||||||
|
int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
bool allocated)
|
||||||
|
{
|
||||||
|
struct extent_status newes;
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
|
||||||
|
lblk, inode->i_ino);
|
||||||
|
|
||||||
|
newes.es_lblk = lblk;
|
||||||
|
newes.es_len = 1;
|
||||||
|
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
|
||||||
|
trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
|
||||||
|
|
||||||
|
ext4_es_insert_extent_check(inode, &newes);
|
||||||
|
|
||||||
|
write_lock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
|
err = __es_remove_extent(inode, lblk, lblk);
|
||||||
|
if (err != 0)
|
||||||
|
goto error;
|
||||||
|
retry:
|
||||||
|
err = __es_insert_extent(inode, &newes);
|
||||||
|
if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
|
||||||
|
128, EXT4_I(inode)))
|
||||||
|
goto retry;
|
||||||
|
if (err != 0)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
if (allocated)
|
||||||
|
__insert_pending(inode, lblk);
|
||||||
|
|
||||||
|
error:
|
||||||
|
write_unlock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
|
ext4_es_print_tree(inode);
|
||||||
|
ext4_print_pending_tree(inode);
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __es_delayed_clu - count number of clusters containing blocks that
|
||||||
|
* are delayed only
|
||||||
|
*
|
||||||
|
* @inode - file containing block range
|
||||||
|
* @start - logical block defining start of range
|
||||||
|
* @end - logical block defining end of range
|
||||||
|
*
|
||||||
|
* Returns the number of clusters containing only delayed (not delayed
|
||||||
|
* and unwritten) blocks in the range specified by @start and @end. Any
|
||||||
|
* cluster or part of a cluster within the range and containing a delayed
|
||||||
|
* and not unwritten block within the range is counted as a whole cluster.
|
||||||
|
*/
|
||||||
|
static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
|
||||||
|
ext4_lblk_t end)
|
||||||
|
{
|
||||||
|
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
|
||||||
|
struct extent_status *es;
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
struct rb_node *node;
|
||||||
|
ext4_lblk_t first_lclu, last_lclu;
|
||||||
|
unsigned long long last_counted_lclu;
|
||||||
|
unsigned int n = 0;
|
||||||
|
|
||||||
|
/* guaranteed to be unequal to any ext4_lblk_t value */
|
||||||
|
last_counted_lclu = ~0ULL;
|
||||||
|
|
||||||
|
es = __es_tree_search(&tree->root, start);
|
||||||
|
|
||||||
|
while (es && (es->es_lblk <= end)) {
|
||||||
|
if (ext4_es_is_delonly(es)) {
|
||||||
|
if (es->es_lblk <= start)
|
||||||
|
first_lclu = EXT4_B2C(sbi, start);
|
||||||
|
else
|
||||||
|
first_lclu = EXT4_B2C(sbi, es->es_lblk);
|
||||||
|
|
||||||
|
if (ext4_es_end(es) >= end)
|
||||||
|
last_lclu = EXT4_B2C(sbi, end);
|
||||||
|
else
|
||||||
|
last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
|
||||||
|
|
||||||
|
if (first_lclu == last_counted_lclu)
|
||||||
|
n += last_lclu - first_lclu;
|
||||||
|
else
|
||||||
|
n += last_lclu - first_lclu + 1;
|
||||||
|
last_counted_lclu = last_lclu;
|
||||||
|
}
|
||||||
|
node = rb_next(&es->rb_node);
|
||||||
|
if (!node)
|
||||||
|
break;
|
||||||
|
es = rb_entry(node, struct extent_status, rb_node);
|
||||||
|
}
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_es_delayed_clu - count number of clusters containing blocks that
|
||||||
|
* are both delayed and unwritten
|
||||||
|
*
|
||||||
|
* @inode - file containing block range
|
||||||
|
* @lblk - logical block defining start of range
|
||||||
|
* @len - number of blocks in range
|
||||||
|
*
|
||||||
|
* Locking for external use of __es_delayed_clu().
|
||||||
|
*/
|
||||||
|
unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
ext4_lblk_t len)
|
||||||
|
{
|
||||||
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||||
|
ext4_lblk_t end;
|
||||||
|
unsigned int n;
|
||||||
|
|
||||||
|
if (len == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
end = lblk + len - 1;
|
||||||
|
WARN_ON(end < lblk);
|
||||||
|
|
||||||
|
read_lock(&ei->i_es_lock);
|
||||||
|
|
||||||
|
n = __es_delayed_clu(inode, lblk, end);
|
||||||
|
|
||||||
|
read_unlock(&ei->i_es_lock);
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* __revise_pending - makes, cancels, or leaves unchanged pending cluster
|
||||||
|
* reservations for a specified block range depending
|
||||||
|
* upon the presence or absence of delayed blocks
|
||||||
|
* outside the range within clusters at the ends of the
|
||||||
|
* range
|
||||||
|
*
|
||||||
|
* @inode - file containing the range
|
||||||
|
* @lblk - logical block defining the start of range
|
||||||
|
* @len - length of range in blocks
|
||||||
|
*
|
||||||
|
* Used after a newly allocated extent is added to the extents status tree.
|
||||||
|
* Requires that the extents in the range have either written or unwritten
|
||||||
|
* status. Must be called while holding i_es_lock.
|
||||||
|
*/
|
||||||
|
static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
ext4_lblk_t len)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
ext4_lblk_t end = lblk + len - 1;
|
||||||
|
ext4_lblk_t first, last;
|
||||||
|
bool f_del = false, l_del = false;
|
||||||
|
|
||||||
|
if (len == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Two cases - block range within single cluster and block range
|
||||||
|
* spanning two or more clusters. Note that a cluster belonging
|
||||||
|
* to a range starting and/or ending on a cluster boundary is treated
|
||||||
|
* as if it does not contain a delayed extent. The new range may
|
||||||
|
* have allocated space for previously delayed blocks out to the
|
||||||
|
* cluster boundary, requiring that any pre-existing pending
|
||||||
|
* reservation be canceled. Because this code only looks at blocks
|
||||||
|
* outside the range, it should revise pending reservations
|
||||||
|
* correctly even if the extent represented by the range can't be
|
||||||
|
* inserted in the extents status tree due to ENOSPC.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
|
||||||
|
first = EXT4_LBLK_CMASK(sbi, lblk);
|
||||||
|
if (first != lblk)
|
||||||
|
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
|
||||||
|
first, lblk - 1);
|
||||||
|
if (f_del) {
|
||||||
|
__insert_pending(inode, first);
|
||||||
|
} else {
|
||||||
|
last = EXT4_LBLK_CMASK(sbi, end) +
|
||||||
|
sbi->s_cluster_ratio - 1;
|
||||||
|
if (last != end)
|
||||||
|
l_del = __es_scan_range(inode,
|
||||||
|
&ext4_es_is_delonly,
|
||||||
|
end + 1, last);
|
||||||
|
if (l_del)
|
||||||
|
__insert_pending(inode, last);
|
||||||
|
else
|
||||||
|
__remove_pending(inode, last);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
first = EXT4_LBLK_CMASK(sbi, lblk);
|
||||||
|
if (first != lblk)
|
||||||
|
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
|
||||||
|
first, lblk - 1);
|
||||||
|
if (f_del)
|
||||||
|
__insert_pending(inode, first);
|
||||||
|
else
|
||||||
|
__remove_pending(inode, first);
|
||||||
|
|
||||||
|
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
|
||||||
|
if (last != end)
|
||||||
|
l_del = __es_scan_range(inode, &ext4_es_is_delonly,
|
||||||
|
end + 1, last);
|
||||||
|
if (l_del)
|
||||||
|
__insert_pending(inode, last);
|
||||||
|
else
|
||||||
|
__remove_pending(inode, last);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_es_remove_blks - remove block range from extents status tree and
|
||||||
|
* reduce reservation count or cancel pending
|
||||||
|
* reservation as needed
|
||||||
|
*
|
||||||
|
* @inode - file containing range
|
||||||
|
* @lblk - first block in range
|
||||||
|
* @len - number of blocks to remove
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
ext4_lblk_t len)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
unsigned int clu_size, reserved = 0;
|
||||||
|
ext4_lblk_t last_lclu, first, length, remainder, last;
|
||||||
|
bool delonly;
|
||||||
|
int err = 0;
|
||||||
|
struct pending_reservation *pr;
|
||||||
|
struct ext4_pending_tree *tree;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process cluster by cluster for bigalloc - there may be up to
|
||||||
|
* two clusters in a 4k page with a 1k block size and two blocks
|
||||||
|
* per cluster. Also necessary for systems with larger page sizes
|
||||||
|
* and potentially larger block sizes.
|
||||||
|
*/
|
||||||
|
clu_size = sbi->s_cluster_ratio;
|
||||||
|
last_lclu = EXT4_B2C(sbi, lblk + len - 1);
|
||||||
|
|
||||||
|
write_lock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
|
for (first = lblk, remainder = len;
|
||||||
|
remainder > 0;
|
||||||
|
first += length, remainder -= length) {
|
||||||
|
|
||||||
|
if (EXT4_B2C(sbi, first) == last_lclu)
|
||||||
|
length = remainder;
|
||||||
|
else
|
||||||
|
length = clu_size - EXT4_LBLK_COFF(sbi, first);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The BH_Delay flag, which triggers calls to this function,
|
||||||
|
* and the contents of the extents status tree can be
|
||||||
|
* inconsistent due to writepages activity. So, note whether
|
||||||
|
* the blocks to be removed actually belong to an extent with
|
||||||
|
* delayed only status.
|
||||||
|
*/
|
||||||
|
delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* because of the writepages effect, written and unwritten
|
||||||
|
* blocks could be removed here
|
||||||
|
*/
|
||||||
|
last = first + length - 1;
|
||||||
|
err = __es_remove_extent(inode, first, last);
|
||||||
|
if (err)
|
||||||
|
ext4_warning(inode->i_sb,
|
||||||
|
"%s: couldn't remove page (err = %d)",
|
||||||
|
__func__, err);
|
||||||
|
|
||||||
|
/* non-bigalloc case: simply count the cluster for release */
|
||||||
|
if (sbi->s_cluster_ratio == 1 && delonly) {
|
||||||
|
reserved++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* bigalloc case: if all delayed allocated only blocks have
|
||||||
|
* just been removed from a cluster, either cancel a pending
|
||||||
|
* reservation if it exists or count a cluster for release
|
||||||
|
*/
|
||||||
|
if (delonly &&
|
||||||
|
!__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
|
||||||
|
pr = __get_pending(inode, EXT4_B2C(sbi, first));
|
||||||
|
if (pr != NULL) {
|
||||||
|
tree = &EXT4_I(inode)->i_pending_tree;
|
||||||
|
rb_erase(&pr->rb_node, &tree->root);
|
||||||
|
kmem_cache_free(ext4_pending_cachep, pr);
|
||||||
|
} else {
|
||||||
|
reserved++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write_unlock(&EXT4_I(inode)->i_es_lock);
|
||||||
|
|
||||||
|
ext4_da_release_space(inode, reserved);
|
||||||
|
}
|
||||||
|
@ -78,6 +78,51 @@ struct ext4_es_stats {
|
|||||||
struct percpu_counter es_stats_shk_cnt;
|
struct percpu_counter es_stats_shk_cnt;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Pending cluster reservations for bigalloc file systems
|
||||||
|
*
|
||||||
|
* A cluster with a pending reservation is a logical cluster shared by at
|
||||||
|
* least one extent in the extents status tree with delayed and unwritten
|
||||||
|
* status and at least one other written or unwritten extent. The
|
||||||
|
* reservation is said to be pending because a cluster reservation would
|
||||||
|
* have to be taken in the event all blocks in the cluster shared with
|
||||||
|
* written or unwritten extents were deleted while the delayed and
|
||||||
|
* unwritten blocks remained.
|
||||||
|
*
|
||||||
|
* The set of pending cluster reservations is an auxiliary data structure
|
||||||
|
* used with the extents status tree to implement reserved cluster/block
|
||||||
|
* accounting for bigalloc file systems. The set is kept in memory and
|
||||||
|
* records all pending cluster reservations.
|
||||||
|
*
|
||||||
|
* Its primary function is to avoid the need to read extents from the
|
||||||
|
* disk when invalidating pages as a result of a truncate, punch hole, or
|
||||||
|
* collapse range operation. Page invalidation requires a decrease in the
|
||||||
|
* reserved cluster count if it results in the removal of all delayed
|
||||||
|
* and unwritten extents (blocks) from a cluster that is not shared with a
|
||||||
|
* written or unwritten extent, and no decrease otherwise. Determining
|
||||||
|
* whether the cluster is shared can be done by searching for a pending
|
||||||
|
* reservation on it.
|
||||||
|
*
|
||||||
|
* Secondarily, it provides a potentially faster method for determining
|
||||||
|
* whether the reserved cluster count should be increased when a physical
|
||||||
|
* cluster is deallocated as a result of a truncate, punch hole, or
|
||||||
|
* collapse range operation. The necessary information is also present
|
||||||
|
* in the extents status tree, but might be more rapidly accessed in
|
||||||
|
* the pending reservation set in many cases due to smaller size.
|
||||||
|
*
|
||||||
|
* The pending cluster reservation set is implemented as a red-black tree
|
||||||
|
* with the goal of minimizing per page search time overhead.
|
||||||
|
*/
|
||||||
|
|
||||||
|
struct pending_reservation {
|
||||||
|
struct rb_node rb_node;
|
||||||
|
ext4_lblk_t lclu;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ext4_pending_tree {
|
||||||
|
struct rb_root root;
|
||||||
|
};
|
||||||
|
|
||||||
extern int __init ext4_init_es(void);
|
extern int __init ext4_init_es(void);
|
||||||
extern void ext4_exit_es(void);
|
extern void ext4_exit_es(void);
|
||||||
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
|
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
|
||||||
@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
|
|||||||
unsigned int status);
|
unsigned int status);
|
||||||
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
|
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
|
||||||
ext4_lblk_t len);
|
ext4_lblk_t len);
|
||||||
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
|
extern void ext4_es_find_extent_range(struct inode *inode,
|
||||||
|
int (*match_fn)(struct extent_status *es),
|
||||||
ext4_lblk_t lblk, ext4_lblk_t end,
|
ext4_lblk_t lblk, ext4_lblk_t end,
|
||||||
struct extent_status *es);
|
struct extent_status *es);
|
||||||
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
|
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
|
||||||
struct extent_status *es);
|
struct extent_status *es);
|
||||||
|
extern bool ext4_es_scan_range(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t lblk, ext4_lblk_t end);
|
||||||
|
extern bool ext4_es_scan_clu(struct inode *inode,
|
||||||
|
int (*matching_fn)(struct extent_status *es),
|
||||||
|
ext4_lblk_t lblk);
|
||||||
|
|
||||||
static inline unsigned int ext4_es_status(struct extent_status *es)
|
static inline unsigned int ext4_es_status(struct extent_status *es)
|
||||||
{
|
{
|
||||||
@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es)
|
|||||||
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
|
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int ext4_es_is_mapped(struct extent_status *es)
|
||||||
|
{
|
||||||
|
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int ext4_es_is_delonly(struct extent_status *es)
|
||||||
|
{
|
||||||
|
return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
|
||||||
|
}
|
||||||
|
|
||||||
static inline void ext4_es_set_referenced(struct extent_status *es)
|
static inline void ext4_es_set_referenced(struct extent_status *es)
|
||||||
{
|
{
|
||||||
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
|
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
|
||||||
@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
|
|||||||
|
|
||||||
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
|
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
|
||||||
|
|
||||||
|
extern int __init ext4_init_pending(void);
|
||||||
|
extern void ext4_exit_pending(void);
|
||||||
|
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
|
||||||
|
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
|
||||||
|
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
|
||||||
|
extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
bool allocated);
|
||||||
|
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
ext4_lblk_t len);
|
||||||
|
extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
|
||||||
|
ext4_lblk_t len);
|
||||||
|
|
||||||
#endif /* _EXT4_EXTENTS_STATUS_H */
|
#endif /* _EXT4_EXTENTS_STATUS_H */
|
||||||
|
@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
|
|||||||
handle_t *handle;
|
handle_t *handle;
|
||||||
struct page *page;
|
struct page *page;
|
||||||
struct ext4_iloc iloc;
|
struct ext4_iloc iloc;
|
||||||
int retries;
|
int retries = 0;
|
||||||
|
|
||||||
ret = ext4_get_inode_loc(inode, &iloc);
|
ret = ext4_get_inode_loc(inode, &iloc);
|
||||||
if (ret)
|
if (ret)
|
||||||
|
138
fs/ext4/inode.c
138
fs/ext4/inode.c
@ -577,7 +577,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
|
|||||||
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
|
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
|
||||||
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
|
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
|
||||||
!(status & EXTENT_STATUS_WRITTEN) &&
|
!(status & EXTENT_STATUS_WRITTEN) &&
|
||||||
ext4_find_delalloc_range(inode, map->m_lblk,
|
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
|
||||||
map->m_lblk + map->m_len - 1))
|
map->m_lblk + map->m_len - 1))
|
||||||
status |= EXTENT_STATUS_DELAYED;
|
status |= EXTENT_STATUS_DELAYED;
|
||||||
ret = ext4_es_insert_extent(inode, map->m_lblk,
|
ret = ext4_es_insert_extent(inode, map->m_lblk,
|
||||||
@ -701,7 +701,7 @@ found:
|
|||||||
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
|
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
|
||||||
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
|
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
|
||||||
!(status & EXTENT_STATUS_WRITTEN) &&
|
!(status & EXTENT_STATUS_WRITTEN) &&
|
||||||
ext4_find_delalloc_range(inode, map->m_lblk,
|
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
|
||||||
map->m_lblk + map->m_len - 1))
|
map->m_lblk + map->m_len - 1))
|
||||||
status |= EXTENT_STATUS_DELAYED;
|
status |= EXTENT_STATUS_DELAYED;
|
||||||
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
|
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
|
||||||
@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode)
|
|||||||
return 0; /* success */
|
return 0; /* success */
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ext4_da_release_space(struct inode *inode, int to_free)
|
void ext4_da_release_space(struct inode *inode, int to_free)
|
||||||
{
|
{
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||||
@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page,
|
|||||||
unsigned int offset,
|
unsigned int offset,
|
||||||
unsigned int length)
|
unsigned int length)
|
||||||
{
|
{
|
||||||
int to_release = 0, contiguous_blks = 0;
|
int contiguous_blks = 0;
|
||||||
struct buffer_head *head, *bh;
|
struct buffer_head *head, *bh;
|
||||||
unsigned int curr_off = 0;
|
unsigned int curr_off = 0;
|
||||||
struct inode *inode = page->mapping->host;
|
struct inode *inode = page->mapping->host;
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
||||||
unsigned int stop = offset + length;
|
unsigned int stop = offset + length;
|
||||||
int num_clusters;
|
|
||||||
ext4_fsblk_t lblk;
|
ext4_fsblk_t lblk;
|
||||||
|
|
||||||
BUG_ON(stop > PAGE_SIZE || stop < length);
|
BUG_ON(stop > PAGE_SIZE || stop < length);
|
||||||
@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
if ((offset <= curr_off) && (buffer_delay(bh))) {
|
if ((offset <= curr_off) && (buffer_delay(bh))) {
|
||||||
to_release++;
|
|
||||||
contiguous_blks++;
|
contiguous_blks++;
|
||||||
clear_buffer_delay(bh);
|
clear_buffer_delay(bh);
|
||||||
} else if (contiguous_blks) {
|
} else if (contiguous_blks) {
|
||||||
@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page,
|
|||||||
(PAGE_SHIFT - inode->i_blkbits);
|
(PAGE_SHIFT - inode->i_blkbits);
|
||||||
lblk += (curr_off >> inode->i_blkbits) -
|
lblk += (curr_off >> inode->i_blkbits) -
|
||||||
contiguous_blks;
|
contiguous_blks;
|
||||||
ext4_es_remove_extent(inode, lblk, contiguous_blks);
|
ext4_es_remove_blks(inode, lblk, contiguous_blks);
|
||||||
contiguous_blks = 0;
|
contiguous_blks = 0;
|
||||||
}
|
}
|
||||||
curr_off = next_off;
|
curr_off = next_off;
|
||||||
@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page,
|
|||||||
if (contiguous_blks) {
|
if (contiguous_blks) {
|
||||||
lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
|
lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
|
||||||
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
|
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
|
||||||
ext4_es_remove_extent(inode, lblk, contiguous_blks);
|
ext4_es_remove_blks(inode, lblk, contiguous_blks);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we have released all the blocks belonging to a cluster, then we
|
|
||||||
* need to release the reserved space for that cluster. */
|
|
||||||
num_clusters = EXT4_NUM_B2C(sbi, to_release);
|
|
||||||
while (num_clusters > 0) {
|
|
||||||
lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
|
|
||||||
((num_clusters - 1) << sbi->s_cluster_bits);
|
|
||||||
if (sbi->s_cluster_ratio == 1 ||
|
|
||||||
!ext4_find_delalloc_cluster(inode, lblk))
|
|
||||||
ext4_da_release_space(inode, 1);
|
|
||||||
|
|
||||||
num_clusters--;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1780,6 +1765,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
|
|||||||
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
|
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ext4_insert_delayed_block - adds a delayed block to the extents status
|
||||||
|
* tree, incrementing the reserved cluster/block
|
||||||
|
* count or making a pending reservation
|
||||||
|
* where needed
|
||||||
|
*
|
||||||
|
* @inode - file containing the newly added block
|
||||||
|
* @lblk - logical block to be added
|
||||||
|
*
|
||||||
|
* Returns 0 on success, negative error code on failure.
|
||||||
|
*/
|
||||||
|
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
|
||||||
|
{
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
int ret;
|
||||||
|
bool allocated = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the cluster containing lblk is shared with a delayed,
|
||||||
|
* written, or unwritten extent in a bigalloc file system, it's
|
||||||
|
* already been accounted for and does not need to be reserved.
|
||||||
|
* A pending reservation must be made for the cluster if it's
|
||||||
|
* shared with a written or unwritten extent and doesn't already
|
||||||
|
* have one. Written and unwritten extents can be purged from the
|
||||||
|
* extents status tree if the system is under memory pressure, so
|
||||||
|
* it's necessary to examine the extent tree if a search of the
|
||||||
|
* extents status tree doesn't get a match.
|
||||||
|
*/
|
||||||
|
if (sbi->s_cluster_ratio == 1) {
|
||||||
|
ret = ext4_da_reserve_space(inode);
|
||||||
|
if (ret != 0) /* ENOSPC */
|
||||||
|
goto errout;
|
||||||
|
} else { /* bigalloc */
|
||||||
|
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
|
||||||
|
if (!ext4_es_scan_clu(inode,
|
||||||
|
&ext4_es_is_mapped, lblk)) {
|
||||||
|
ret = ext4_clu_mapped(inode,
|
||||||
|
EXT4_B2C(sbi, lblk));
|
||||||
|
if (ret < 0)
|
||||||
|
goto errout;
|
||||||
|
if (ret == 0) {
|
||||||
|
ret = ext4_da_reserve_space(inode);
|
||||||
|
if (ret != 0) /* ENOSPC */
|
||||||
|
goto errout;
|
||||||
|
} else {
|
||||||
|
allocated = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
allocated = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
|
||||||
|
|
||||||
|
errout:
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function is grabs code from the very beginning of
|
* This function is grabs code from the very beginning of
|
||||||
* ext4_map_blocks, but assumes that the caller is from delayed write
|
* ext4_map_blocks, but assumes that the caller is from delayed write
|
||||||
@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
|
|||||||
add_delayed:
|
add_delayed:
|
||||||
if (retval == 0) {
|
if (retval == 0) {
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX: __block_prepare_write() unmaps passed block,
|
* XXX: __block_prepare_write() unmaps passed block,
|
||||||
* is it OK?
|
* is it OK?
|
||||||
*/
|
*/
|
||||||
/*
|
|
||||||
* If the block was allocated from previously allocated cluster,
|
|
||||||
* then we don't need to reserve it again. However we still need
|
|
||||||
* to reserve metadata for every block we're going to write.
|
|
||||||
*/
|
|
||||||
if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
|
|
||||||
!ext4_find_delalloc_cluster(inode, map->m_lblk)) {
|
|
||||||
ret = ext4_da_reserve_space(inode);
|
|
||||||
if (ret) {
|
|
||||||
/* not enough space to reserve */
|
|
||||||
retval = ret;
|
|
||||||
goto out_unlock;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
|
ret = ext4_insert_delayed_block(inode, map->m_lblk);
|
||||||
~0, EXTENT_STATUS_DELAYED);
|
if (ret != 0) {
|
||||||
if (ret) {
|
|
||||||
retval = ret;
|
retval = ret;
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
}
|
}
|
||||||
@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
|
|||||||
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
|
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
|
||||||
struct extent_status es;
|
struct extent_status es;
|
||||||
|
|
||||||
ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
|
ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
|
||||||
|
map.m_lblk, end, &es);
|
||||||
|
|
||||||
if (!es.es_len || es.es_lblk > end) {
|
if (!es.es_len || es.es_lblk > end) {
|
||||||
/* entire range is a hole */
|
/* entire range is a hole */
|
||||||
@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
|
|||||||
return !buffer_mapped(bh);
|
return !buffer_mapped(bh);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ext4_page_mkwrite(struct vm_fault *vmf)
|
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma = vmf->vma;
|
struct vm_area_struct *vma = vmf->vma;
|
||||||
struct page *page = vmf->page;
|
struct page *page = vmf->page;
|
||||||
loff_t size;
|
loff_t size;
|
||||||
unsigned long len;
|
unsigned long len;
|
||||||
int ret;
|
int err;
|
||||||
|
vm_fault_t ret;
|
||||||
struct file *file = vma->vm_file;
|
struct file *file = vma->vm_file;
|
||||||
struct inode *inode = file_inode(file);
|
struct inode *inode = file_inode(file);
|
||||||
struct address_space *mapping = inode->i_mapping;
|
struct address_space *mapping = inode->i_mapping;
|
||||||
@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
|
|||||||
|
|
||||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||||
|
|
||||||
ret = ext4_convert_inline_data(inode);
|
err = ext4_convert_inline_data(inode);
|
||||||
if (ret)
|
if (err)
|
||||||
goto out_ret;
|
goto out_ret;
|
||||||
|
|
||||||
/* Delalloc case is easy... */
|
/* Delalloc case is easy... */
|
||||||
@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
|
|||||||
!ext4_should_journal_data(inode) &&
|
!ext4_should_journal_data(inode) &&
|
||||||
!ext4_nonda_switch(inode->i_sb)) {
|
!ext4_nonda_switch(inode->i_sb)) {
|
||||||
do {
|
do {
|
||||||
ret = block_page_mkwrite(vma, vmf,
|
err = block_page_mkwrite(vma, vmf,
|
||||||
ext4_da_get_block_prep);
|
ext4_da_get_block_prep);
|
||||||
} while (ret == -ENOSPC &&
|
} while (err == -ENOSPC &&
|
||||||
ext4_should_retry_alloc(inode->i_sb, &retries));
|
ext4_should_retry_alloc(inode->i_sb, &retries));
|
||||||
goto out_ret;
|
goto out_ret;
|
||||||
}
|
}
|
||||||
@ -6228,8 +6260,8 @@ retry_alloc:
|
|||||||
ret = VM_FAULT_SIGBUS;
|
ret = VM_FAULT_SIGBUS;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
ret = block_page_mkwrite(vma, vmf, get_block);
|
err = block_page_mkwrite(vma, vmf, get_block);
|
||||||
if (!ret && ext4_should_journal_data(inode)) {
|
if (!err && ext4_should_journal_data(inode)) {
|
||||||
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
|
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
|
||||||
PAGE_SIZE, NULL, do_journal_get_write_access)) {
|
PAGE_SIZE, NULL, do_journal_get_write_access)) {
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
@ -6240,24 +6272,24 @@ retry_alloc:
|
|||||||
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
|
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
|
||||||
}
|
}
|
||||||
ext4_journal_stop(handle);
|
ext4_journal_stop(handle);
|
||||||
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
|
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
|
||||||
goto retry_alloc;
|
goto retry_alloc;
|
||||||
out_ret:
|
out_ret:
|
||||||
ret = block_page_mkwrite_return(ret);
|
ret = block_page_mkwrite_return(err);
|
||||||
out:
|
out:
|
||||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||||
sb_end_pagefault(inode->i_sb);
|
sb_end_pagefault(inode->i_sb);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ext4_filemap_fault(struct vm_fault *vmf)
|
vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
struct inode *inode = file_inode(vmf->vma->vm_file);
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
||||||
int err;
|
vm_fault_t ret;
|
||||||
|
|
||||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||||
err = filemap_fault(vmf);
|
ret = filemap_fault(vmf);
|
||||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||||
|
|
||||||
return err;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
|
|||||||
ei1 = EXT4_I(inode1);
|
ei1 = EXT4_I(inode1);
|
||||||
ei2 = EXT4_I(inode2);
|
ei2 = EXT4_I(inode2);
|
||||||
|
|
||||||
swap(inode1->i_flags, inode2->i_flags);
|
|
||||||
swap(inode1->i_version, inode2->i_version);
|
swap(inode1->i_version, inode2->i_version);
|
||||||
swap(inode1->i_blocks, inode2->i_blocks);
|
swap(inode1->i_blocks, inode2->i_blocks);
|
||||||
swap(inode1->i_bytes, inode2->i_bytes);
|
swap(inode1->i_bytes, inode2->i_bytes);
|
||||||
@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
|
|||||||
i_size_write(inode2, isize);
|
i_size_write(inode2, isize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void reset_inode_seed(struct inode *inode)
|
||||||
|
{
|
||||||
|
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||||
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||||
|
__le32 inum = cpu_to_le32(inode->i_ino);
|
||||||
|
__le32 gen = cpu_to_le32(inode->i_generation);
|
||||||
|
__u32 csum;
|
||||||
|
|
||||||
|
if (!ext4_has_metadata_csum(inode->i_sb))
|
||||||
|
return;
|
||||||
|
|
||||||
|
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
|
||||||
|
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Swap the information from the given @inode and the inode
|
* Swap the information from the given @inode and the inode
|
||||||
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
|
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
|
||||||
@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
|
|||||||
struct inode *inode_bl;
|
struct inode *inode_bl;
|
||||||
struct ext4_inode_info *ei_bl;
|
struct ext4_inode_info *ei_bl;
|
||||||
|
|
||||||
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
|
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
|
||||||
|
IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
|
||||||
|
ext4_has_inline_data(inode))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
|
if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
|
||||||
|
!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
|
||||||
return -EPERM;
|
return -EPERM;
|
||||||
|
|
||||||
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
|
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
|
||||||
@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
|
|||||||
* that only 1 swap_inode_boot_loader is running. */
|
* that only 1 swap_inode_boot_loader is running. */
|
||||||
lock_two_nondirectories(inode, inode_bl);
|
lock_two_nondirectories(inode, inode_bl);
|
||||||
|
|
||||||
truncate_inode_pages(&inode->i_data, 0);
|
|
||||||
truncate_inode_pages(&inode_bl->i_data, 0);
|
|
||||||
|
|
||||||
/* Wait for all existing dio workers */
|
/* Wait for all existing dio workers */
|
||||||
inode_dio_wait(inode);
|
inode_dio_wait(inode);
|
||||||
inode_dio_wait(inode_bl);
|
inode_dio_wait(inode_bl);
|
||||||
|
|
||||||
|
truncate_inode_pages(&inode->i_data, 0);
|
||||||
|
truncate_inode_pages(&inode_bl->i_data, 0);
|
||||||
|
|
||||||
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
|
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
|
||||||
if (IS_ERR(handle)) {
|
if (IS_ERR(handle)) {
|
||||||
err = -EINVAL;
|
err = -EINVAL;
|
||||||
@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
|
|||||||
|
|
||||||
inode->i_generation = prandom_u32();
|
inode->i_generation = prandom_u32();
|
||||||
inode_bl->i_generation = prandom_u32();
|
inode_bl->i_generation = prandom_u32();
|
||||||
|
reset_inode_seed(inode);
|
||||||
|
reset_inode_seed(inode_bl);
|
||||||
|
|
||||||
ext4_discard_preallocations(inode);
|
ext4_discard_preallocations(inode);
|
||||||
|
|
||||||
@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
|
|||||||
inode->i_ino, err);
|
inode->i_ino, err);
|
||||||
/* Revert all changes: */
|
/* Revert all changes: */
|
||||||
swap_inode_data(inode, inode_bl);
|
swap_inode_data(inode, inode_bl);
|
||||||
|
ext4_mark_inode_dirty(handle, inode);
|
||||||
} else {
|
} else {
|
||||||
err = ext4_mark_inode_dirty(handle, inode_bl);
|
err = ext4_mark_inode_dirty(handle, inode_bl);
|
||||||
if (err < 0) {
|
if (err < 0) {
|
||||||
@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
|
|||||||
/* Revert all changes: */
|
/* Revert all changes: */
|
||||||
swap_inode_data(inode, inode_bl);
|
swap_inode_data(inode, inode_bl);
|
||||||
ext4_mark_inode_dirty(handle, inode);
|
ext4_mark_inode_dirty(handle, inode);
|
||||||
|
ext4_mark_inode_dirty(handle, inode_bl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ext4_journal_stop(handle);
|
ext4_journal_stop(handle);
|
||||||
@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
|
|||||||
if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
|
if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
err = mnt_want_write_file(filp);
|
|
||||||
if (err)
|
|
||||||
return err;
|
|
||||||
|
|
||||||
err = -EPERM;
|
err = -EPERM;
|
||||||
inode_lock(inode);
|
|
||||||
/* Is it quota file? Do not allow user to mess with it */
|
/* Is it quota file? Do not allow user to mess with it */
|
||||||
if (ext4_is_quota_file(inode))
|
if (ext4_is_quota_file(inode))
|
||||||
goto out_unlock;
|
return err;
|
||||||
|
|
||||||
err = ext4_get_inode_loc(inode, &iloc);
|
err = ext4_get_inode_loc(inode, &iloc);
|
||||||
if (err)
|
if (err)
|
||||||
goto out_unlock;
|
return err;
|
||||||
|
|
||||||
raw_inode = ext4_raw_inode(&iloc);
|
raw_inode = ext4_raw_inode(&iloc);
|
||||||
if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
|
if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
|
||||||
@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
|
|||||||
EXT4_SB(sb)->s_want_extra_isize,
|
EXT4_SB(sb)->s_want_extra_isize,
|
||||||
&iloc);
|
&iloc);
|
||||||
if (err)
|
if (err)
|
||||||
goto out_unlock;
|
return err;
|
||||||
} else {
|
} else {
|
||||||
brelse(iloc.bh);
|
brelse(iloc.bh);
|
||||||
}
|
}
|
||||||
|
|
||||||
dquot_initialize(inode);
|
err = dquot_initialize(inode);
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
|
||||||
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
|
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
|
||||||
EXT4_QUOTA_INIT_BLOCKS(sb) +
|
EXT4_QUOTA_INIT_BLOCKS(sb) +
|
||||||
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
|
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
|
||||||
if (IS_ERR(handle)) {
|
if (IS_ERR(handle))
|
||||||
err = PTR_ERR(handle);
|
return PTR_ERR(handle);
|
||||||
goto out_unlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = ext4_reserve_inode_write(handle, inode, &iloc);
|
err = ext4_reserve_inode_write(handle, inode, &iloc);
|
||||||
if (err)
|
if (err)
|
||||||
@ -400,9 +416,6 @@ out_dirty:
|
|||||||
err = rc;
|
err = rc;
|
||||||
out_stop:
|
out_stop:
|
||||||
ext4_journal_stop(handle);
|
ext4_journal_stop(handle);
|
||||||
out_unlock:
|
|
||||||
inode_unlock(inode);
|
|
||||||
mnt_drop_write_file(filp);
|
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -626,6 +639,30 @@ group_add_out:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Project Quota ID state is only allowed to change from within the init
|
||||||
|
* namespace. Enforce that restriction only if we are trying to change
|
||||||
|
* the quota ID state. Everything else is allowed in user namespaces.
|
||||||
|
*/
|
||||||
|
if (current_user_ns() == &init_user_ns)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
|
||||||
|
if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
|
||||||
|
return -EINVAL;
|
||||||
|
} else {
|
||||||
|
if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
||||||
{
|
{
|
||||||
struct inode *inode = file_inode(filp);
|
struct inode *inode = file_inode(filp);
|
||||||
@ -1025,19 +1062,19 @@ resizefs_out:
|
|||||||
return err;
|
return err;
|
||||||
|
|
||||||
inode_lock(inode);
|
inode_lock(inode);
|
||||||
|
err = ext4_ioctl_check_project(inode, &fa);
|
||||||
|
if (err)
|
||||||
|
goto out;
|
||||||
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
|
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
|
||||||
(flags & EXT4_FL_XFLAG_VISIBLE);
|
(flags & EXT4_FL_XFLAG_VISIBLE);
|
||||||
err = ext4_ioctl_setflags(inode, flags);
|
err = ext4_ioctl_setflags(inode, flags);
|
||||||
|
if (err)
|
||||||
|
goto out;
|
||||||
|
err = ext4_ioctl_setproject(filp, fa.fsx_projid);
|
||||||
|
out:
|
||||||
inode_unlock(inode);
|
inode_unlock(inode);
|
||||||
mnt_drop_write_file(filp);
|
mnt_drop_write_file(filp);
|
||||||
if (err)
|
|
||||||
return err;
|
return err;
|
||||||
|
|
||||||
err = ext4_ioctl_setproject(filp, fa.fsx_projid);
|
|
||||||
if (err)
|
|
||||||
return err;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
case EXT4_IOC_SHUTDOWN:
|
case EXT4_IOC_SHUTDOWN:
|
||||||
return ext4_shutdown(sb, arg);
|
return ext4_shutdown(sb, arg);
|
||||||
|
@ -4915,9 +4915,17 @@ do_more:
|
|||||||
&sbi->s_flex_groups[flex_group].free_clusters);
|
&sbi->s_flex_groups[flex_group].free_clusters);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* on a bigalloc file system, defer the s_freeclusters_counter
|
||||||
|
* update to the caller (ext4_remove_space and friends) so they
|
||||||
|
* can determine if a cluster freed here should be rereserved
|
||||||
|
*/
|
||||||
|
if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
|
||||||
if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
|
if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
|
||||||
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
|
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
|
||||||
percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
|
percpu_counter_add(&sbi->s_freeclusters_counter,
|
||||||
|
count_clusters);
|
||||||
|
}
|
||||||
|
|
||||||
ext4_mb_unload_buddy(&e4b);
|
ext4_mb_unload_buddy(&e4b);
|
||||||
|
|
||||||
|
@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode,
|
|||||||
orig_inode->i_ino, donor_inode->i_ino);
|
orig_inode->i_ino, donor_inode->i_ino);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
if (orig_eof < orig_start + *len - 1)
|
if (orig_eof <= orig_start)
|
||||||
|
*len = 0;
|
||||||
|
else if (orig_eof < orig_start + *len - 1)
|
||||||
*len = orig_eof - orig_start;
|
*len = orig_eof - orig_start;
|
||||||
if (donor_eof < donor_start + *len - 1)
|
if (donor_eof <= donor_start)
|
||||||
|
*len = 0;
|
||||||
|
else if (donor_eof < donor_start + *len - 1)
|
||||||
*len = donor_eof - donor_start;
|
*len = donor_eof - donor_start;
|
||||||
if (!*len) {
|
if (!*len) {
|
||||||
ext4_debug("ext4 move extent: len should not be 0 "
|
ext4_debug("ext4 move extent: len should not be 0 "
|
||||||
|
@ -2261,7 +2261,7 @@ again:
|
|||||||
dxroot->info.indirect_levels += 1;
|
dxroot->info.indirect_levels += 1;
|
||||||
dxtrace(printk(KERN_DEBUG
|
dxtrace(printk(KERN_DEBUG
|
||||||
"Creating %d level index...\n",
|
"Creating %d level index...\n",
|
||||||
info->indirect_levels));
|
dxroot->info.indirect_levels));
|
||||||
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
|
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
|
||||||
if (err)
|
if (err)
|
||||||
goto journal_error;
|
goto journal_error;
|
||||||
|
@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb)
|
|||||||
for (type = 0; type < EXT4_MAXQUOTAS; type++)
|
for (type = 0; type < EXT4_MAXQUOTAS; type++)
|
||||||
ext4_quota_off(sb, type);
|
ext4_quota_off(sb, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is a helper function which is used in the mount/remount
|
||||||
|
* codepaths (which holds s_umount) to fetch the quota file name.
|
||||||
|
*/
|
||||||
|
static inline char *get_qf_name(struct super_block *sb,
|
||||||
|
struct ext4_sb_info *sbi,
|
||||||
|
int type)
|
||||||
|
{
|
||||||
|
return rcu_dereference_protected(sbi->s_qf_names[type],
|
||||||
|
lockdep_is_held(&sb->s_umount));
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
static inline void ext4_quota_off_umount(struct super_block *sb)
|
static inline void ext4_quota_off_umount(struct super_block *sb)
|
||||||
{
|
{
|
||||||
@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb)
|
|||||||
percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
|
percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
for (i = 0; i < EXT4_MAXQUOTAS; i++)
|
for (i = 0; i < EXT4_MAXQUOTAS; i++)
|
||||||
kfree(sbi->s_qf_names[i]);
|
kfree(get_qf_name(sb, sbi, i));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Debugging code just in case the in-memory inode orphan list
|
/* Debugging code just in case the in-memory inode orphan list
|
||||||
@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
|
|||||||
ei->i_da_metadata_calc_len = 0;
|
ei->i_da_metadata_calc_len = 0;
|
||||||
ei->i_da_metadata_calc_last_lblock = 0;
|
ei->i_da_metadata_calc_last_lblock = 0;
|
||||||
spin_lock_init(&(ei->i_block_reservation_lock));
|
spin_lock_init(&(ei->i_block_reservation_lock));
|
||||||
|
ext4_init_pending_tree(&ei->i_pending_tree);
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
ei->i_reserved_quota = 0;
|
ei->i_reserved_quota = 0;
|
||||||
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
|
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
|
||||||
@ -1530,11 +1543,10 @@ static const char deprecated_msg[] =
|
|||||||
static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
|
static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
|
||||||
{
|
{
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
||||||
char *qname;
|
char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
|
|
||||||
if (sb_any_quota_loaded(sb) &&
|
if (sb_any_quota_loaded(sb) && !old_qname) {
|
||||||
!sbi->s_qf_names[qtype]) {
|
|
||||||
ext4_msg(sb, KERN_ERR,
|
ext4_msg(sb, KERN_ERR,
|
||||||
"Cannot change journaled "
|
"Cannot change journaled "
|
||||||
"quota options when quota turned on");
|
"quota options when quota turned on");
|
||||||
@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
|
|||||||
"Not enough memory for storing quotafile name");
|
"Not enough memory for storing quotafile name");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (sbi->s_qf_names[qtype]) {
|
if (old_qname) {
|
||||||
if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
|
if (strcmp(old_qname, qname) == 0)
|
||||||
ret = 1;
|
ret = 1;
|
||||||
else
|
else
|
||||||
ext4_msg(sb, KERN_ERR,
|
ext4_msg(sb, KERN_ERR,
|
||||||
@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
|
|||||||
"quotafile must be on filesystem root");
|
"quotafile must be on filesystem root");
|
||||||
goto errout;
|
goto errout;
|
||||||
}
|
}
|
||||||
sbi->s_qf_names[qtype] = qname;
|
rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
|
||||||
set_opt(sb, QUOTA);
|
set_opt(sb, QUOTA);
|
||||||
return 1;
|
return 1;
|
||||||
errout:
|
errout:
|
||||||
@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype)
|
|||||||
{
|
{
|
||||||
|
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
||||||
|
char *old_qname = get_qf_name(sb, sbi, qtype);
|
||||||
|
|
||||||
if (sb_any_quota_loaded(sb) &&
|
if (sb_any_quota_loaded(sb) && old_qname) {
|
||||||
sbi->s_qf_names[qtype]) {
|
|
||||||
ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
|
ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
|
||||||
" when quota turned on");
|
" when quota turned on");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
kfree(sbi->s_qf_names[qtype]);
|
rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
|
||||||
sbi->s_qf_names[qtype] = NULL;
|
synchronize_rcu();
|
||||||
|
kfree(old_qname);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb,
|
|||||||
int is_remount)
|
int is_remount)
|
||||||
{
|
{
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
||||||
char *p;
|
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
|
||||||
substring_t args[MAX_OPT_ARGS];
|
substring_t args[MAX_OPT_ARGS];
|
||||||
int token;
|
int token;
|
||||||
|
|
||||||
@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb,
|
|||||||
"Cannot enable project quota enforcement.");
|
"Cannot enable project quota enforcement.");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
|
usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
|
||||||
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
|
grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
|
||||||
|
if (usr_qf_name || grp_qf_name) {
|
||||||
|
if (test_opt(sb, USRQUOTA) && usr_qf_name)
|
||||||
clear_opt(sb, USRQUOTA);
|
clear_opt(sb, USRQUOTA);
|
||||||
|
|
||||||
if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
|
if (test_opt(sb, GRPQUOTA) && grp_qf_name)
|
||||||
clear_opt(sb, GRPQUOTA);
|
clear_opt(sb, GRPQUOTA);
|
||||||
|
|
||||||
if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
|
if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
|
||||||
@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
|
|||||||
{
|
{
|
||||||
#if defined(CONFIG_QUOTA)
|
#if defined(CONFIG_QUOTA)
|
||||||
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
||||||
|
char *usr_qf_name, *grp_qf_name;
|
||||||
|
|
||||||
if (sbi->s_jquota_fmt) {
|
if (sbi->s_jquota_fmt) {
|
||||||
char *fmtname = "";
|
char *fmtname = "";
|
||||||
@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
|
|||||||
seq_printf(seq, ",jqfmt=%s", fmtname);
|
seq_printf(seq, ",jqfmt=%s", fmtname);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sbi->s_qf_names[USRQUOTA])
|
rcu_read_lock();
|
||||||
seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
|
usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
|
||||||
|
grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
|
||||||
if (sbi->s_qf_names[GRPQUOTA])
|
if (usr_qf_name)
|
||||||
seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
|
seq_show_option(seq, "usrjquota", usr_qf_name);
|
||||||
|
if (grp_qf_name)
|
||||||
|
seq_show_option(seq, "grpjquota", grp_qf_name);
|
||||||
|
rcu_read_unlock();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
|
|||||||
int err = 0;
|
int err = 0;
|
||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
int i, j;
|
int i, j;
|
||||||
|
char *to_free[EXT4_MAXQUOTAS];
|
||||||
#endif
|
#endif
|
||||||
char *orig_data = kstrdup(data, GFP_KERNEL);
|
char *orig_data = kstrdup(data, GFP_KERNEL);
|
||||||
|
|
||||||
@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
|
|||||||
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
|
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
|
||||||
for (i = 0; i < EXT4_MAXQUOTAS; i++)
|
for (i = 0; i < EXT4_MAXQUOTAS; i++)
|
||||||
if (sbi->s_qf_names[i]) {
|
if (sbi->s_qf_names[i]) {
|
||||||
old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
|
char *qf_name = get_qf_name(sb, sbi, i);
|
||||||
GFP_KERNEL);
|
|
||||||
|
old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
|
||||||
if (!old_opts.s_qf_names[i]) {
|
if (!old_opts.s_qf_names[i]) {
|
||||||
for (j = 0; j < i; j++)
|
for (j = 0; j < i; j++)
|
||||||
kfree(old_opts.s_qf_names[j]);
|
kfree(old_opts.s_qf_names[j]);
|
||||||
@ -5352,9 +5373,12 @@ restore_opts:
|
|||||||
#ifdef CONFIG_QUOTA
|
#ifdef CONFIG_QUOTA
|
||||||
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
|
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
|
||||||
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
|
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
|
||||||
kfree(sbi->s_qf_names[i]);
|
to_free[i] = get_qf_name(sb, sbi, i);
|
||||||
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
|
rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
|
||||||
}
|
}
|
||||||
|
synchronize_rcu();
|
||||||
|
for (i = 0; i < EXT4_MAXQUOTAS; i++)
|
||||||
|
kfree(to_free[i]);
|
||||||
#endif
|
#endif
|
||||||
kfree(orig_data);
|
kfree(orig_data);
|
||||||
return err;
|
return err;
|
||||||
@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type)
|
|||||||
*/
|
*/
|
||||||
static int ext4_quota_on_mount(struct super_block *sb, int type)
|
static int ext4_quota_on_mount(struct super_block *sb, int type)
|
||||||
{
|
{
|
||||||
return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
|
return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
|
||||||
EXT4_SB(sb)->s_jquota_fmt, type);
|
EXT4_SB(sb)->s_jquota_fmt, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void)
|
|||||||
if (err)
|
if (err)
|
||||||
return err;
|
return err;
|
||||||
|
|
||||||
|
err = ext4_init_pending();
|
||||||
|
if (err)
|
||||||
|
goto out6;
|
||||||
|
|
||||||
err = ext4_init_pageio();
|
err = ext4_init_pageio();
|
||||||
if (err)
|
if (err)
|
||||||
goto out5;
|
goto out5;
|
||||||
@ -5992,6 +6020,8 @@ out3:
|
|||||||
out4:
|
out4:
|
||||||
ext4_exit_pageio();
|
ext4_exit_pageio();
|
||||||
out5:
|
out5:
|
||||||
|
ext4_exit_pending();
|
||||||
|
out6:
|
||||||
ext4_exit_es();
|
ext4_exit_es();
|
||||||
|
|
||||||
return err;
|
return err;
|
||||||
@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void)
|
|||||||
ext4_exit_system_zone();
|
ext4_exit_system_zone();
|
||||||
ext4_exit_pageio();
|
ext4_exit_pageio();
|
||||||
ext4_exit_es();
|
ext4_exit_es();
|
||||||
|
ext4_exit_pending();
|
||||||
}
|
}
|
||||||
|
|
||||||
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
|
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
|
||||||
|
@ -251,8 +251,8 @@ restart:
|
|||||||
bh = jh2bh(jh);
|
bh = jh2bh(jh);
|
||||||
|
|
||||||
if (buffer_locked(bh)) {
|
if (buffer_locked(bh)) {
|
||||||
spin_unlock(&journal->j_list_lock);
|
|
||||||
get_bh(bh);
|
get_bh(bh);
|
||||||
|
spin_unlock(&journal->j_list_lock);
|
||||||
wait_on_buffer(bh);
|
wait_on_buffer(bh);
|
||||||
/* the journal_head may have gone by now */
|
/* the journal_head may have gone by now */
|
||||||
BUFFER_TRACE(bh, "brelse");
|
BUFFER_TRACE(bh, "brelse");
|
||||||
@ -333,8 +333,8 @@ restart2:
|
|||||||
jh = transaction->t_checkpoint_io_list;
|
jh = transaction->t_checkpoint_io_list;
|
||||||
bh = jh2bh(jh);
|
bh = jh2bh(jh);
|
||||||
if (buffer_locked(bh)) {
|
if (buffer_locked(bh)) {
|
||||||
spin_unlock(&journal->j_list_lock);
|
|
||||||
get_bh(bh);
|
get_bh(bh);
|
||||||
|
spin_unlock(&journal->j_list_lock);
|
||||||
wait_on_buffer(bh);
|
wait_on_buffer(bh);
|
||||||
/* the journal_head may have gone by now */
|
/* the journal_head may have gone by now */
|
||||||
BUFFER_TRACE(bh, "brelse");
|
BUFFER_TRACE(bh, "brelse");
|
||||||
|
@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to);
|
|||||||
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
|
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
get_block_t get_block);
|
get_block_t get_block);
|
||||||
/* Convert errno to return value from ->page_mkwrite() call */
|
/* Convert errno to return value from ->page_mkwrite() call */
|
||||||
static inline int block_page_mkwrite_return(int err)
|
static inline vm_fault_t block_page_mkwrite_return(int err)
|
||||||
{
|
{
|
||||||
if (err == 0)
|
if (err == 0)
|
||||||
return VM_FAULT_LOCKED;
|
return VM_FAULT_LOCKED;
|
||||||
|
@ -17,6 +17,7 @@ struct mpage_da_data;
|
|||||||
struct ext4_map_blocks;
|
struct ext4_map_blocks;
|
||||||
struct extent_status;
|
struct extent_status;
|
||||||
struct ext4_fsmap;
|
struct ext4_fsmap;
|
||||||
|
struct partial_cluster;
|
||||||
|
|
||||||
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
|
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
|
||||||
|
|
||||||
@ -2037,19 +2038,21 @@ TRACE_EVENT(ext4_ext_show_extent,
|
|||||||
TRACE_EVENT(ext4_remove_blocks,
|
TRACE_EVENT(ext4_remove_blocks,
|
||||||
TP_PROTO(struct inode *inode, struct ext4_extent *ex,
|
TP_PROTO(struct inode *inode, struct ext4_extent *ex,
|
||||||
ext4_lblk_t from, ext4_fsblk_t to,
|
ext4_lblk_t from, ext4_fsblk_t to,
|
||||||
long long partial_cluster),
|
struct partial_cluster *pc),
|
||||||
|
|
||||||
TP_ARGS(inode, ex, from, to, partial_cluster),
|
TP_ARGS(inode, ex, from, to, pc),
|
||||||
|
|
||||||
TP_STRUCT__entry(
|
TP_STRUCT__entry(
|
||||||
__field( dev_t, dev )
|
__field( dev_t, dev )
|
||||||
__field( ino_t, ino )
|
__field( ino_t, ino )
|
||||||
__field( ext4_lblk_t, from )
|
__field( ext4_lblk_t, from )
|
||||||
__field( ext4_lblk_t, to )
|
__field( ext4_lblk_t, to )
|
||||||
__field( long long, partial )
|
|
||||||
__field( ext4_fsblk_t, ee_pblk )
|
__field( ext4_fsblk_t, ee_pblk )
|
||||||
__field( ext4_lblk_t, ee_lblk )
|
__field( ext4_lblk_t, ee_lblk )
|
||||||
__field( unsigned short, ee_len )
|
__field( unsigned short, ee_len )
|
||||||
|
__field( ext4_fsblk_t, pc_pclu )
|
||||||
|
__field( ext4_lblk_t, pc_lblk )
|
||||||
|
__field( int, pc_state)
|
||||||
),
|
),
|
||||||
|
|
||||||
TP_fast_assign(
|
TP_fast_assign(
|
||||||
@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks,
|
|||||||
__entry->ino = inode->i_ino;
|
__entry->ino = inode->i_ino;
|
||||||
__entry->from = from;
|
__entry->from = from;
|
||||||
__entry->to = to;
|
__entry->to = to;
|
||||||
__entry->partial = partial_cluster;
|
|
||||||
__entry->ee_pblk = ext4_ext_pblock(ex);
|
__entry->ee_pblk = ext4_ext_pblock(ex);
|
||||||
__entry->ee_lblk = le32_to_cpu(ex->ee_block);
|
__entry->ee_lblk = le32_to_cpu(ex->ee_block);
|
||||||
__entry->ee_len = ext4_ext_get_actual_len(ex);
|
__entry->ee_len = ext4_ext_get_actual_len(ex);
|
||||||
|
__entry->pc_pclu = pc->pclu;
|
||||||
|
__entry->pc_lblk = pc->lblk;
|
||||||
|
__entry->pc_state = pc->state;
|
||||||
),
|
),
|
||||||
|
|
||||||
TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
|
TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
|
||||||
"from %u to %u partial_cluster %lld",
|
"from %u to %u partial [pclu %lld lblk %u state %d]",
|
||||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||||
(unsigned long) __entry->ino,
|
(unsigned long) __entry->ino,
|
||||||
(unsigned) __entry->ee_lblk,
|
(unsigned) __entry->ee_lblk,
|
||||||
@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks,
|
|||||||
(unsigned short) __entry->ee_len,
|
(unsigned short) __entry->ee_len,
|
||||||
(unsigned) __entry->from,
|
(unsigned) __entry->from,
|
||||||
(unsigned) __entry->to,
|
(unsigned) __entry->to,
|
||||||
(long long) __entry->partial)
|
(long long) __entry->pc_pclu,
|
||||||
|
(unsigned int) __entry->pc_lblk,
|
||||||
|
(int) __entry->pc_state)
|
||||||
);
|
);
|
||||||
|
|
||||||
TRACE_EVENT(ext4_ext_rm_leaf,
|
TRACE_EVENT(ext4_ext_rm_leaf,
|
||||||
TP_PROTO(struct inode *inode, ext4_lblk_t start,
|
TP_PROTO(struct inode *inode, ext4_lblk_t start,
|
||||||
struct ext4_extent *ex,
|
struct ext4_extent *ex,
|
||||||
long long partial_cluster),
|
struct partial_cluster *pc),
|
||||||
|
|
||||||
TP_ARGS(inode, start, ex, partial_cluster),
|
TP_ARGS(inode, start, ex, pc),
|
||||||
|
|
||||||
TP_STRUCT__entry(
|
TP_STRUCT__entry(
|
||||||
__field( dev_t, dev )
|
__field( dev_t, dev )
|
||||||
__field( ino_t, ino )
|
__field( ino_t, ino )
|
||||||
__field( long long, partial )
|
|
||||||
__field( ext4_lblk_t, start )
|
__field( ext4_lblk_t, start )
|
||||||
__field( ext4_lblk_t, ee_lblk )
|
__field( ext4_lblk_t, ee_lblk )
|
||||||
__field( ext4_fsblk_t, ee_pblk )
|
__field( ext4_fsblk_t, ee_pblk )
|
||||||
__field( short, ee_len )
|
__field( short, ee_len )
|
||||||
|
__field( ext4_fsblk_t, pc_pclu )
|
||||||
|
__field( ext4_lblk_t, pc_lblk )
|
||||||
|
__field( int, pc_state)
|
||||||
),
|
),
|
||||||
|
|
||||||
TP_fast_assign(
|
TP_fast_assign(
|
||||||
__entry->dev = inode->i_sb->s_dev;
|
__entry->dev = inode->i_sb->s_dev;
|
||||||
__entry->ino = inode->i_ino;
|
__entry->ino = inode->i_ino;
|
||||||
__entry->partial = partial_cluster;
|
|
||||||
__entry->start = start;
|
__entry->start = start;
|
||||||
__entry->ee_lblk = le32_to_cpu(ex->ee_block);
|
__entry->ee_lblk = le32_to_cpu(ex->ee_block);
|
||||||
__entry->ee_pblk = ext4_ext_pblock(ex);
|
__entry->ee_pblk = ext4_ext_pblock(ex);
|
||||||
__entry->ee_len = ext4_ext_get_actual_len(ex);
|
__entry->ee_len = ext4_ext_get_actual_len(ex);
|
||||||
|
__entry->pc_pclu = pc->pclu;
|
||||||
|
__entry->pc_lblk = pc->lblk;
|
||||||
|
__entry->pc_state = pc->state;
|
||||||
),
|
),
|
||||||
|
|
||||||
TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
|
TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
|
||||||
"partial_cluster %lld",
|
"partial [pclu %lld lblk %u state %d]",
|
||||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||||
(unsigned long) __entry->ino,
|
(unsigned long) __entry->ino,
|
||||||
(unsigned) __entry->start,
|
(unsigned) __entry->start,
|
||||||
(unsigned) __entry->ee_lblk,
|
(unsigned) __entry->ee_lblk,
|
||||||
(unsigned long long) __entry->ee_pblk,
|
(unsigned long long) __entry->ee_pblk,
|
||||||
(unsigned short) __entry->ee_len,
|
(unsigned short) __entry->ee_len,
|
||||||
(long long) __entry->partial)
|
(long long) __entry->pc_pclu,
|
||||||
|
(unsigned int) __entry->pc_lblk,
|
||||||
|
(int) __entry->pc_state)
|
||||||
);
|
);
|
||||||
|
|
||||||
TRACE_EVENT(ext4_ext_rm_idx,
|
TRACE_EVENT(ext4_ext_rm_idx,
|
||||||
@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space,
|
|||||||
|
|
||||||
TRACE_EVENT(ext4_ext_remove_space_done,
|
TRACE_EVENT(ext4_ext_remove_space_done,
|
||||||
TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
|
TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
|
||||||
int depth, long long partial, __le16 eh_entries),
|
int depth, struct partial_cluster *pc, __le16 eh_entries),
|
||||||
|
|
||||||
TP_ARGS(inode, start, end, depth, partial, eh_entries),
|
TP_ARGS(inode, start, end, depth, pc, eh_entries),
|
||||||
|
|
||||||
TP_STRUCT__entry(
|
TP_STRUCT__entry(
|
||||||
__field( dev_t, dev )
|
__field( dev_t, dev )
|
||||||
@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done,
|
|||||||
__field( ext4_lblk_t, start )
|
__field( ext4_lblk_t, start )
|
||||||
__field( ext4_lblk_t, end )
|
__field( ext4_lblk_t, end )
|
||||||
__field( int, depth )
|
__field( int, depth )
|
||||||
__field( long long, partial )
|
__field( ext4_fsblk_t, pc_pclu )
|
||||||
|
__field( ext4_lblk_t, pc_lblk )
|
||||||
|
__field( int, pc_state )
|
||||||
__field( unsigned short, eh_entries )
|
__field( unsigned short, eh_entries )
|
||||||
),
|
),
|
||||||
|
|
||||||
@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done,
|
|||||||
__entry->start = start;
|
__entry->start = start;
|
||||||
__entry->end = end;
|
__entry->end = end;
|
||||||
__entry->depth = depth;
|
__entry->depth = depth;
|
||||||
__entry->partial = partial;
|
__entry->pc_pclu = pc->pclu;
|
||||||
|
__entry->pc_lblk = pc->lblk;
|
||||||
|
__entry->pc_state = pc->state;
|
||||||
__entry->eh_entries = le16_to_cpu(eh_entries);
|
__entry->eh_entries = le16_to_cpu(eh_entries);
|
||||||
),
|
),
|
||||||
|
|
||||||
TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld "
|
TP_printk("dev %d,%d ino %lu since %u end %u depth %d "
|
||||||
|
"partial [pclu %lld lblk %u state %d] "
|
||||||
"remaining_entries %u",
|
"remaining_entries %u",
|
||||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||||
(unsigned long) __entry->ino,
|
(unsigned long) __entry->ino,
|
||||||
(unsigned) __entry->start,
|
(unsigned) __entry->start,
|
||||||
(unsigned) __entry->end,
|
(unsigned) __entry->end,
|
||||||
__entry->depth,
|
__entry->depth,
|
||||||
(long long) __entry->partial,
|
(long long) __entry->pc_pclu,
|
||||||
|
(unsigned int) __entry->pc_lblk,
|
||||||
|
(int) __entry->pc_state,
|
||||||
(unsigned short) __entry->eh_entries)
|
(unsigned short) __entry->eh_entries)
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -2270,7 +2290,7 @@ TRACE_EVENT(ext4_es_remove_extent,
|
|||||||
__entry->lblk, __entry->len)
|
__entry->lblk, __entry->len)
|
||||||
);
|
);
|
||||||
|
|
||||||
TRACE_EVENT(ext4_es_find_delayed_extent_range_enter,
|
TRACE_EVENT(ext4_es_find_extent_range_enter,
|
||||||
TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
|
TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
|
||||||
|
|
||||||
TP_ARGS(inode, lblk),
|
TP_ARGS(inode, lblk),
|
||||||
@ -2292,7 +2312,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter,
|
|||||||
(unsigned long) __entry->ino, __entry->lblk)
|
(unsigned long) __entry->ino, __entry->lblk)
|
||||||
);
|
);
|
||||||
|
|
||||||
TRACE_EVENT(ext4_es_find_delayed_extent_range_exit,
|
TRACE_EVENT(ext4_es_find_extent_range_exit,
|
||||||
TP_PROTO(struct inode *inode, struct extent_status *es),
|
TP_PROTO(struct inode *inode, struct extent_status *es),
|
||||||
|
|
||||||
TP_ARGS(inode, es),
|
TP_ARGS(inode, es),
|
||||||
@ -2512,6 +2532,41 @@ TRACE_EVENT(ext4_es_shrink,
|
|||||||
__entry->scan_time, __entry->nr_skipped, __entry->retried)
|
__entry->scan_time, __entry->nr_skipped, __entry->retried)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
TRACE_EVENT(ext4_es_insert_delayed_block,
|
||||||
|
TP_PROTO(struct inode *inode, struct extent_status *es,
|
||||||
|
bool allocated),
|
||||||
|
|
||||||
|
TP_ARGS(inode, es, allocated),
|
||||||
|
|
||||||
|
TP_STRUCT__entry(
|
||||||
|
__field( dev_t, dev )
|
||||||
|
__field( ino_t, ino )
|
||||||
|
__field( ext4_lblk_t, lblk )
|
||||||
|
__field( ext4_lblk_t, len )
|
||||||
|
__field( ext4_fsblk_t, pblk )
|
||||||
|
__field( char, status )
|
||||||
|
__field( bool, allocated )
|
||||||
|
),
|
||||||
|
|
||||||
|
TP_fast_assign(
|
||||||
|
__entry->dev = inode->i_sb->s_dev;
|
||||||
|
__entry->ino = inode->i_ino;
|
||||||
|
__entry->lblk = es->es_lblk;
|
||||||
|
__entry->len = es->es_len;
|
||||||
|
__entry->pblk = ext4_es_pblock(es);
|
||||||
|
__entry->status = ext4_es_status(es);
|
||||||
|
__entry->allocated = allocated;
|
||||||
|
),
|
||||||
|
|
||||||
|
TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
|
||||||
|
"allocated %d",
|
||||||
|
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||||
|
(unsigned long) __entry->ino,
|
||||||
|
__entry->lblk, __entry->len,
|
||||||
|
__entry->pblk, show_extent_status(__entry->status),
|
||||||
|
__entry->allocated)
|
||||||
|
);
|
||||||
|
|
||||||
/* fsmap traces */
|
/* fsmap traces */
|
||||||
DECLARE_EVENT_CLASS(ext4_fsmap_class,
|
DECLARE_EVENT_CLASS(ext4_fsmap_class,
|
||||||
TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
|
TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
|
||||||
|
Loading…
Reference in New Issue
Block a user