Merge tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - The largest change for this cycle is the DM zoned target's metadata
   version 2 feature that adds support for pairing regular block devices
   with a zoned device to ease the performance impact associated with the
   finite random zones of the zoned device.

   The changes came in three batches: the first prepared for and then
   added the ability to pair a single regular block device, the second was
   a batch of fixes to improve zoned's reclaim heuristic, and the third
   removed the limitation of only adding a single additional regular block
   device to allow many devices. Testing has shown linear scaling as more
   devices are added.

 - Add a new emulated block size (ebs) target that emulates a smaller
   logical_block_size than a block device supports. The primary use-case
   is to emulate "512e" devices that have a 512 byte logical_block_size
   and a 4KB physical_block_size. This is useful to some legacy
   applications that otherwise wouldn't be able to be used on 4K devices
   because they depend on issuing IO in 512 byte granularity.

 - Add discard interfaces to DM bufio. The first consumer of the interface
   is the dm-ebs target, which makes heavy use of dm-bufio.

 - Fix DM crypt's block queue_limits stacking to not truncate
   logical_block_size.

 - Add documentation for DM integrity's status line.

 - Switch DMDEBUG from a compile-time config option to instead use dynamic
   debug via pr_debug.

 - Fix DM multipath target's heuristic for how it manages
   "queue_if_no_path" state internally. DM multipath now avoids disabling
   "queue_if_no_path" unless it is actually needed (e.g. in response to a
   configured timeout or an explicit "fail_if_no_path" message). This
   fixes reports of spurious -EIO being reported back to userspace
   applications during fault tolerance testing with an NVMe backend.
   Various dynamic DMDEBUG messages were added to assist with debugging
   queue_if_no_path in the future.

 - Add a new DM multipath "Historical Service Time" path selector.

 - Fix DM multipath's dm_blk_ioctl() to switch paths on IO error.

 - Improve DM writecache target performance by using explicit cache
   flushing for the target's single-threaded use case, plus a small
   cleanup to remove an unnecessary test in persistent_memory_claim.

 - Other small cleanups in DM core, dm-persistent-data, and DM integrity.

* tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (62 commits)
  dm crypt: avoid truncating the logical block size
  dm mpath: add DM device name to Failing/Reinstating path log messages
  dm mpath: enhance queue_if_no_path debugging
  dm mpath: restrict queue_if_no_path state machine
  dm mpath: simplify __must_push_back
  dm zoned: check superblock location
  dm zoned: prefer full zones for reclaim
  dm zoned: select reclaim zone based on device index
  dm zoned: allocate zone by device index
  dm zoned: support arbitrary number of devices
  dm zoned: move random and sequential zones into struct dmz_dev
  dm zoned: per-device reclaim
  dm zoned: add metadata pointer to struct dmz_dev
  dm zoned: add device pointer to struct dm_zone
  dm zoned: allocate temporary superblock for tertiary devices
  dm zoned: convert to xarray
  dm zoned: add a 'reserved' zone flag
  dm zoned: improve logging messages for reclaim
  dm zoned: avoid unnecessary device recalulation for secondary superblock
  dm zoned: add debugging message for reading superblocks
  ...
commit b25c6644bf

Documentation/admin-guide/device-mapper/dm-ebs.rst (new file, 51 lines)
@@ -0,0 +1,51 @@
======
dm-ebs
======


This target is similar to the linear target except that it emulates
a smaller logical block size on a device with a larger logical block
size.  Its main purpose is to provide emulation of 512 byte sectors on
devices that do not provide this emulation (i.e. 4K native disks).

Supported emulated logical block sizes are 512, 1024, 2048 and 4096.

The underlying block size can be set to > 4K to test buffering larger units.


Table parameters
----------------
  <dev path> <offset> <emulated sectors> [<underlying sectors>]

Mandatory parameters:

    <dev path>:
        Full pathname to the underlying block-device,
        or a "major:minor" device-number.
    <offset>:
        Starting sector within the device;
        has to be a multiple of <emulated sectors>.
    <emulated sectors>:
        Number of sectors defining the logical block size to be emulated;
        1, 2, 4, 8 sectors of 512 bytes supported.

Optional parameter:

    <underlying sectors>:
        Number of sectors defining the logical block size of <dev path>.
        2^N supported, e.g. 8 = emulate 8 sectors of 512 bytes = 4KiB.
        If not provided, the logical block size of <dev path> will be used.


Examples:

Emulate 1 sector = 512 bytes logical block size on /dev/sda starting at
offset 1024 sectors, with the underlying device's block size automatically set:

ebs /dev/sda 1024 1

Emulate 2 sectors = 1KiB logical block size on /dev/sda starting at
offset 128 sectors, and enforce a 2KiB underlying device block size.
This presumes a logical block size of 2KiB or less on /dev/sda to work:

ebs /dev/sda 128 2 4
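The examples above map emulated 512-byte sectors onto larger underlying blocks. A minimal userspace sketch of the sector-to-block arithmetic involved, not part of the patch — it mirrors the __sector_to_block()/__block_mod()/__nr_blocks() helpers of drivers/md/dm-ebs-target.c later in this diff, and the 4 KiB block size and sample I/O are assumptions for illustration:

#include <stdio.h>

/* Underlying block size in 512-byte sectors (8 sectors = 4 KiB), as in the examples above. */
#define U_BS 8u

/* Sector on the underlying device -> underlying block number (U_BS is a power of two). */
static unsigned long long sector_to_block(unsigned long long sector)
{
	return sector / U_BS;
}

/* Offset of a sector within its underlying block. */
static unsigned long long block_mod(unsigned long long sector)
{
	return sector & (U_BS - 1);
}

/* Number of underlying blocks touched by an I/O, counting partial first/last blocks. */
static unsigned long long nr_blocks(unsigned long long start_sector, unsigned long long sectors)
{
	unsigned long long end = block_mod(start_sector) + sectors;

	return end / U_BS + (block_mod(end) ? 1 : 0);
}

int main(void)
{
	/* A hypothetical 3-sector write starting at sector 1030 (misaligned on both ends). */
	unsigned long long start = 1030, len = 3;

	printf("first block %llu, offset in block %llu sectors, blocks touched %llu\n",
	       sector_to_block(start), block_mod(start), nr_blocks(start, len));
	return 0;
}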
Documentation/admin-guide/device-mapper/dm-integrity.rst
@@ -193,6 +193,14 @@ should not be changed when reloading the target because the layout of disk
 data depend on them and the reloaded target would be non-functional.
 
 
+Status line:
+
+1. the number of integrity mismatches
+2. provided data sectors - that is the number of sectors that the user
+   could use
+3. the current recalculating position (or '-' if we didn't recalculate)
+
+
 The layout of the formatted block device:
 
 * reserved sectors
Documentation/admin-guide/device-mapper/dm-zoned.rst
@@ -37,9 +37,13 @@ Algorithm
 dm-zoned implements an on-disk buffering scheme to handle non-sequential
 write accesses to the sequential zones of a zoned block device.
 Conventional zones are used for caching as well as for storing internal
-metadata.
+metadata. It can also use a regular block device together with the zoned
+block device; in that case the regular block device will be split logically
+in zones with the same size as the zoned block device. These zones will be
+placed in front of the zones from the zoned block device and will be handled
+just like conventional zones.
 
-The zones of the device are separated into 2 types:
+The zones of the device(s) are separated into 2 types:
 
 1) Metadata zones: these are conventional zones used to store metadata.
 Metadata zones are not reported as useable capacity to the user.
@@ -127,6 +131,13 @@ resumed. Flushing metadata thus only temporarily delays write and
 discard requests. Read requests can be processed concurrently while
 metadata flush is being executed.
 
+If a regular device is used in conjunction with the zoned block device,
+a third set of metadata (without the zone bitmaps) is written to the
+start of the zoned block device. This metadata has a generation counter of
+'0' and will never be updated during normal operation; it just serves for
+identification purposes. The first and second copy of the metadata
+are located at the start of the regular block device.
+
 Usage
 =====
 
@@ -138,9 +149,46 @@ Ex::
 
 	dmzadm --format /dev/sdxx
 
-For a formatted device, the target can be created normally with the
-dmsetup utility. The only parameter that dm-zoned requires is the
-underlying zoned block device name. Ex::
-
-	echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \
-	dmsetup create dmz-`basename ${dev}`
+If two drives are to be used, both devices must be specified, with the
+regular block device as the first device.
+
+Ex::
+
+	dmzadm --format /dev/sdxx /dev/sdyy
+
+
+Formatted device(s) can be started with the dmzadm utility, too:
+
+Ex::
+
+	dmzadm --start /dev/sdxx /dev/sdyy
+
+
+Information about the internal layout and current usage of the zones can
+be obtained with the 'status' callback from dmsetup:
+
+Ex::
+
+	dmsetup status /dev/dm-X
+
+will return a line
+
+	0 <size> zoned <nr_zones> zones <nr_unmap_rnd>/<nr_rnd> random <nr_unmap_seq>/<nr_seq> sequential
+
+where <nr_zones> is the total number of zones, <nr_unmap_rnd> is the number
+of unmapped (ie free) random zones, <nr_rnd> the total number of random
+zones, <nr_unmap_seq> the number of unmapped sequential zones, and <nr_seq>
+the total number of sequential zones.
+
+Normally the reclaim process will be started once there are less than 50
+percent free random zones. In order to start the reclaim process manually
+even before reaching this threshold the 'dmsetup message' function can be
+used:
+
+Ex::
+
+	dmsetup message /dev/dm-X 0 reclaim
+
+will start the reclaim process and random zones will be moved to sequential
+zones.
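Not part of the patch: a small userspace sketch that splits the dm-zoned status line documented above into its fields; the sample line and its numbers are made up for illustration.

#include <stdio.h>

int main(void)
{
	/* Hypothetical "dmsetup status" output for a dm-zoned device, per the format above. */
	const char *status = "0 976773168 zoned 3725 zones 890/1860 random 1850/1865 sequential";
	unsigned long long start, size;
	unsigned nr_zones, unmap_rnd, nr_rnd, unmap_seq, nr_seq;

	if (sscanf(status, "%llu %llu zoned %u zones %u/%u random %u/%u sequential",
		   &start, &size, &nr_zones, &unmap_rnd, &nr_rnd, &unmap_seq, &nr_seq) != 7) {
		fprintf(stderr, "unexpected status format\n");
		return 1;
	}

	/* unmapped means free, so these ratios show how full each zone type is. */
	printf("%u zones total, %u/%u random zones free, %u/%u sequential zones free\n",
	       nr_zones, unmap_rnd, nr_rnd, unmap_seq, nr_seq);
	return 0;
}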
drivers/md/Kconfig
@@ -269,6 +269,7 @@ config DM_UNSTRIPED
 config DM_CRYPT
 	tristate "Crypt target support"
 	depends on BLK_DEV_DM
+	depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
 	select CRYPTO
 	select CRYPTO_CBC
 	select CRYPTO_ESSIV
@@ -336,6 +337,14 @@ config DM_WRITECACHE
 	  The writecache target doesn't cache reads because reads are supposed
 	  to be cached in standard RAM.
 
+config DM_EBS
+	tristate "Emulated block size target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM
+	select DM_BUFIO
+	help
+	  dm-ebs emulates smaller logical block size on backing devices
+	  with larger ones (e.g. 512 byte sectors on 4K native disks).
+
 config DM_ERA
 	tristate "Era target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM
@@ -443,6 +452,17 @@ config DM_MULTIPATH_ST
 
 	  If unsure, say N.
 
+config DM_MULTIPATH_HST
+	tristate "I/O Path Selector based on historical service time"
+	depends on DM_MULTIPATH
+	help
+	  This path selector is a dynamic load balancer which selects
+	  the path expected to complete the incoming I/O in the shortest
+	  time by comparing estimated service time (based on historical
+	  service time).
+
+	  If unsure, say N.
+
 config DM_DELAY
 	tristate "I/O delaying target"
 	depends on BLK_DEV_DM
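The DM_MULTIPATH_HST help text above describes selection by estimated service time; the math behind it is the fixed-point exponential moving average implemented by fixed_ema() in drivers/md/dm-historical-service-time.c further down in this diff. A minimal userspace sketch of that calculation, not part of the patch — the 1 ms / 5 ms sample values are made up for illustration:

#include <stdio.h>
#include <stdint.h>

#define FIXED_SHIFT 10			/* 10 fractional bits, like HST_FIXED_SHIFT below */
#define FIXED_1     (1u << FIXED_SHIFT)	/* 1.0 in fixed point */
#define WEIGHT_95   972			/* ~0.95 in 10-bit fixed point, like HST_FIXED_95 */

/* new = last * w + sample * (1 - w), all fixed point; mirrors fixed_ema() later in this diff. */
static uint64_t fixed_ema(uint64_t last, uint64_t next, uint64_t weight)
{
	last *= weight;
	last += next * (FIXED_1 - weight);
	last += 1ull << (FIXED_SHIFT - 1);	/* round to nearest */
	return last >> FIXED_SHIFT;
}

int main(void)
{
	/* Hypothetical service times in ns: the average sits at 1 ms, then 5 ms completions arrive. */
	uint64_t avg = 1000000, sample = 5000000;
	int i;

	for (i = 0; i < 5; i++) {
		avg = fixed_ema(avg, sample, WEIGHT_95);
		printf("after sample %d: avg = %llu ns\n", i + 1, (unsigned long long)avg);
	}
	return 0;
}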
drivers/md/Makefile
@@ -17,6 +17,7 @@ dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
 dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
 		    dm-cache-background-tracker.o
 dm-cache-smq-y	+= dm-cache-policy-smq.o
+dm-ebs-y	+= dm-ebs-target.o
 dm-era-y	+= dm-era-target.o
 dm-clone-y	+= dm-clone-target.o dm-clone-metadata.o
 dm-verity-y	+= dm-verity-target.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY)		+= dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_HST)	+= dm-historical-service-time.o
 obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_PERSISTENT_DATA)	+= persistent-data/
@@ -65,6 +67,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
+obj-$(CONFIG_DM_EBS)		+= dm-ebs.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_CLONE)		+= dm-clone.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
drivers/md/dm-bufio.c
@@ -256,12 +256,35 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 		if (b->block == block)
 			return b;
 
-		n = (b->block < block) ? n->rb_left : n->rb_right;
+		n = block < b->block ? n->rb_left : n->rb_right;
 	}
 
 	return NULL;
 }
 
+static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
+{
+	struct rb_node *n = c->buffer_tree.rb_node;
+	struct dm_buffer *b;
+	struct dm_buffer *best = NULL;
+
+	while (n) {
+		b = container_of(n, struct dm_buffer, node);
+
+		if (b->block == block)
+			return b;
+
+		if (block <= b->block) {
+			n = n->rb_left;
+			best = b;
+		} else {
+			n = n->rb_right;
+		}
+	}
+
+	return best;
+}
+
 static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
 {
 	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
@@ -276,8 +299,8 @@ static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
 		}
 
 		parent = *new;
-		new = (found->block < b->block) ?
-			&((*new)->rb_left) : &((*new)->rb_right);
+		new = b->block < found->block ?
+			&found->node.rb_left : &found->node.rb_right;
 	}
 
 	rb_link_node(&b->node, parent, new);
@@ -631,6 +654,19 @@ dmio:
 	submit_bio(bio);
 }
 
+static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
+{
+	sector_t sector;
+
+	if (likely(c->sectors_per_block_bits >= 0))
+		sector = block << c->sectors_per_block_bits;
+	else
+		sector = block * (c->block_size >> SECTOR_SHIFT);
+	sector += c->start;
+
+	return sector;
+}
+
 static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
 {
 	unsigned n_sectors;
@@ -639,11 +675,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
 
 	b->end_io = end_io;
 
-	if (likely(b->c->sectors_per_block_bits >= 0))
-		sector = b->block << b->c->sectors_per_block_bits;
-	else
-		sector = b->block * (b->c->block_size >> SECTOR_SHIFT);
-	sector += b->c->start;
+	sector = block_to_sector(b->c, b->block);
 
 	if (rw != REQ_OP_WRITE) {
 		n_sectors = b->c->block_size >> SECTOR_SHIFT;
@@ -1325,6 +1357,30 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
 }
 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
 
+/*
+ * Use dm-io to send a discard request to flush the device.
+ */
+int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
+{
+	struct dm_io_request io_req = {
+		.bi_op = REQ_OP_DISCARD,
+		.bi_op_flags = REQ_SYNC,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.addr = NULL,
+		.client = c->dm_io,
+	};
+	struct dm_io_region io_reg = {
+		.bdev = c->bdev,
+		.sector = block_to_sector(c, block),
+		.count = block_to_sector(c, count),
+	};
+
+	BUG_ON(dm_bufio_in_request());
+
+	return dm_io(&io_req, 1, &io_reg, NULL);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
+
 /*
  * We first delete any other buffer that may be at that new location.
  *
@@ -1401,6 +1457,14 @@ retry:
 }
 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
 
+static void forget_buffer_locked(struct dm_buffer *b)
+{
+	if (likely(!b->hold_count) && likely(!b->state)) {
+		__unlink_buffer(b);
+		__free_buffer_wake(b);
+	}
+}
+
 /*
  * Free the given buffer.
  *
@@ -1414,15 +1478,36 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
 	dm_bufio_lock(c);
 
 	b = __find(c, block);
-	if (b && likely(!b->hold_count) && likely(!b->state)) {
-		__unlink_buffer(b);
-		__free_buffer_wake(b);
-	}
+	if (b)
+		forget_buffer_locked(b);
 
 	dm_bufio_unlock(c);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_forget);
 
+void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
+{
+	struct dm_buffer *b;
+	sector_t end_block = block + n_blocks;
+
+	while (block < end_block) {
+		dm_bufio_lock(c);
+
+		b = __find_next(c, block);
+		if (b) {
+			block = b->block + 1;
+			forget_buffer_locked(b);
+		}
+
+		dm_bufio_unlock(c);
+
+		if (!b)
+			break;
+	}
+}
+EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
+
 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
 {
 	c->minimum_buffers = n;

drivers/md/dm-crypt.c
@@ -34,7 +34,9 @@
 #include <crypto/aead.h>
 #include <crypto/authenc.h>
 #include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */
+#include <linux/key-type.h>
 #include <keys/user-type.h>
+#include <keys/encrypted-type.h>
 
 #include <linux/device-mapper.h>
 
@@ -212,7 +214,7 @@ struct crypt_config {
 	struct mutex bio_alloc_lock;
 
 	u8 *authenc_key; /* space for keys in authenc() format (if used) */
-	u8 key[0];
+	u8 key[];
 };
 
 #define MIN_IOS 64
@@ -2215,12 +2217,47 @@ static bool contains_whitespace(const char *str)
 	return false;
 }
 
+static int set_key_user(struct crypt_config *cc, struct key *key)
+{
+	const struct user_key_payload *ukp;
+
+	ukp = user_key_payload_locked(key);
+	if (!ukp)
+		return -EKEYREVOKED;
+
+	if (cc->key_size != ukp->datalen)
+		return -EINVAL;
+
+	memcpy(cc->key, ukp->data, cc->key_size);
+
+	return 0;
+}
+
+#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
+static int set_key_encrypted(struct crypt_config *cc, struct key *key)
+{
+	const struct encrypted_key_payload *ekp;
+
+	ekp = key->payload.data[0];
+	if (!ekp)
+		return -EKEYREVOKED;
+
+	if (cc->key_size != ekp->decrypted_datalen)
+		return -EINVAL;
+
+	memcpy(cc->key, ekp->decrypted_data, cc->key_size);
+
+	return 0;
+}
+#endif /* CONFIG_ENCRYPTED_KEYS */
+
 static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string)
 {
 	char *new_key_string, *key_desc;
 	int ret;
+	struct key_type *type;
 	struct key *key;
-	const struct user_key_payload *ukp;
+	int (*set_key)(struct crypt_config *cc, struct key *key);
 
 	/*
 	 * Reject key_string with whitespace. dm core currently lacks code for
@@ -2236,16 +2273,26 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string)
 	if (!key_desc || key_desc == key_string || !strlen(key_desc + 1))
 		return -EINVAL;
 
-	if (strncmp(key_string, "logon:", key_desc - key_string + 1) &&
-	    strncmp(key_string, "user:", key_desc - key_string + 1))
+	if (!strncmp(key_string, "logon:", key_desc - key_string + 1)) {
+		type = &key_type_logon;
+		set_key = set_key_user;
+	} else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) {
+		type = &key_type_user;
+		set_key = set_key_user;
+#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
+	} else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) {
+		type = &key_type_encrypted;
+		set_key = set_key_encrypted;
+#endif
+	} else {
 		return -EINVAL;
+	}
 
 	new_key_string = kstrdup(key_string, GFP_KERNEL);
 	if (!new_key_string)
 		return -ENOMEM;
 
-	key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user,
-			  key_desc + 1, NULL);
+	key = request_key(type, key_desc + 1, NULL);
 	if (IS_ERR(key)) {
 		kzfree(new_key_string);
 		return PTR_ERR(key);
@@ -2253,23 +2300,14 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string)
 
 	down_read(&key->sem);
 
-	ukp = user_key_payload_locked(key);
-	if (!ukp) {
+	ret = set_key(cc, key);
+	if (ret < 0) {
 		up_read(&key->sem);
 		key_put(key);
 		kzfree(new_key_string);
-		return -EKEYREVOKED;
+		return ret;
 	}
 
-	if (cc->key_size != ukp->datalen) {
-		up_read(&key->sem);
-		key_put(key);
-		kzfree(new_key_string);
-		return -EINVAL;
-	}
-
-	memcpy(cc->key, ukp->data, cc->key_size);
-
 	up_read(&key->sem);
 	key_put(key);
 
@@ -2323,7 +2361,7 @@ static int get_key_size(char **key_string)
 	return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1;
 }
 
-#endif
+#endif /* CONFIG_KEYS */
 
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
@@ -3274,7 +3312,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	limits->max_segment_size = PAGE_SIZE;
 
 	limits->logical_block_size =
-		max_t(unsigned short, limits->logical_block_size, cc->sector_size);
+		max_t(unsigned, limits->logical_block_size, cc->sector_size);
 	limits->physical_block_size =
 		max_t(unsigned, limits->physical_block_size, cc->sector_size);
 	limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size);
@@ -3282,7 +3320,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 20, 0},
+	.version = {1, 21, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
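The dm-bufio changes above export two new calls, dm_bufio_issue_discard() and dm_bufio_forget_buffers(). A condensed, kernel-style sketch of how a target's discard path can drive them, not part of the patch — struct sketch_ctx and sketch_discard() are hypothetical names, and the pattern is a compressed restatement of what __ebs_discard_bio()/__ebs_forget_bio() do in the dm-ebs target below:

#include <linux/dm-bufio.h>	/* dm_bufio_issue_discard(), dm_bufio_forget_buffers() */

/* Hypothetical target context; only the fields needed for the sketch. */
struct sketch_ctx {
	struct dm_bufio_client *bufio;
	unsigned int u_bs;		/* underlying block size in sectors, power of two */
	unsigned char block_shift;	/* log2(u_bs) */
};

/* Pass a discard covering [sector, sector + nr_sectors) down via dm-bufio:
 * drop any cached buffers for the range, then discard only whole blocks. */
static int sketch_discard(struct sketch_ctx *c, sector_t sector, sector_t nr_sectors)
{
	sector_t end = sector + nr_sectors;
	sector_t block = sector >> c->block_shift;
	/* Blocks touched by the range, counting partial first/last blocks. */
	sector_t blocks = ((end + c->u_bs - 1) >> c->block_shift) - block;

	/* Invalidate cached buffers for the whole range first. */
	dm_bufio_forget_buffers(c->bufio, block, blocks);

	/* Trim partial first and last blocks: only whole blocks may be discarded. */
	if (sector & (c->u_bs - 1)) {
		block++;
		blocks--;
	}
	if (blocks && (end & (c->u_bs - 1)))
		blocks--;

	return blocks ? dm_bufio_issue_discard(c->bufio, block, blocks) : 0;
}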
drivers/md/dm-ebs-target.c (new file, 471 lines)
@@ -0,0 +1,471 @@
/*
 * Copyright (C) 2020 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Device-mapper target to emulate smaller logical block
 * size on backing devices exposing (natively) larger ones.
 *
 * E.g. 512 byte sector emulation on 4K native disks.
 */

#include "dm.h"
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/dm-bufio.h>

#define DM_MSG_PREFIX "ebs"

static void ebs_dtr(struct dm_target *ti);

/* Emulated block size context. */
struct ebs_c {
	struct dm_dev *dev;		/* Underlying device to emulate block size on. */
	struct dm_bufio_client *bufio;	/* Use dm-bufio for read and read-modify-write processing. */
	struct workqueue_struct *wq;	/* Workqueue for ^ processing of bios. */
	struct work_struct ws;		/* Work item used for ^. */
	struct bio_list bios_in;	/* Worker bios input list. */
	spinlock_t lock;		/* Guard bios input list above. */
	sector_t start;			/* <start> table line argument, see ebs_ctr below. */
	unsigned int e_bs;		/* Emulated block size in sectors exposed to upper layer. */
	unsigned int u_bs;		/* Underlying block size in sectors retrieved from/set on lower layer device. */
	unsigned char block_shift;	/* bitshift sectors -> blocks used in dm-bufio API. */
	bool u_bs_set:1;		/* Flag to indicate underlying block size is set on table line. */
};

static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector)
{
	return sector >> ec->block_shift;
}

static inline sector_t __block_mod(sector_t sector, unsigned int bs)
{
	return sector & (bs - 1);
}

/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
{
	sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);

	return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0);
}

static inline bool __ebs_check_bs(unsigned int bs)
{
	return bs && is_power_of_2(bs);
}

/*
 * READ/WRITE:
 *
 * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages.
 */
static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter)
{
	int r = 0;
	unsigned char *ba, *pa;
	unsigned int cur_len;
	unsigned int bv_len = bv->bv_len;
	unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs));
	sector_t block = __sector_to_block(ec, iter->bi_sector);
	struct dm_buffer *b;

	if (unlikely(!bv->bv_page || !bv_len))
		return -EIO;

	pa = page_address(bv->bv_page) + bv->bv_offset;

	/* Handle overlapping page <-> blocks */
	while (bv_len) {
		cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len);

		/* Avoid reading for writes in case bio vector's page overwrites block completely. */
		if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio))
			ba = dm_bufio_read(ec->bufio, block, &b);
		else
			ba = dm_bufio_new(ec->bufio, block, &b);

		if (unlikely(IS_ERR(ba))) {
			/*
			 * Carry on with next buffer, if any, to issue all possible
			 * data but return error.
			 */
			r = PTR_ERR(ba);
		} else {
			/* Copy data to/from bio to buffer if read/new was successful above. */
			ba += buf_off;
			if (rw == READ) {
				memcpy(pa, ba, cur_len);
				flush_dcache_page(bv->bv_page);
			} else {
				flush_dcache_page(bv->bv_page);
				memcpy(ba, pa, cur_len);
				dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len);
			}

			dm_bufio_release(b);
		}

		pa += cur_len;
		bv_len -= cur_len;
		buf_off = 0;
		block++;
	}

	return r;
}

/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */
static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio)
{
	int r = 0, rr;
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_bvec(bv, bio, iter) {
		rr = __ebs_rw_bvec(ec, rw, &bv, &iter);
		if (rr)
			r = rr;
	}

	return r;
}

/*
 * Discard bio's blocks, i.e. pass discards down.
 *
 * Avoid discarding partial blocks at beginning and end;
 * return 0 in case no blocks can be discarded as a result.
 */
static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t block, blocks, sector = bio->bi_iter.bi_sector;

	block = __sector_to_block(ec, sector);
	blocks = __nr_blocks(ec, bio);

	/*
	 * Partial first underlying block (__nr_blocks() may have
	 * resulted in one block).
	 */
	if (__block_mod(sector, ec->u_bs)) {
		block++;
		blocks--;
	}

	/* Partial last underlying block if any. */
	if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs))
		blocks--;

	return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0;
}

/* Release the bio's blocks from the bufio cache. */
static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t blocks, sector = bio->bi_iter.bi_sector;

	blocks = __nr_blocks(ec, bio);

	dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
}

/* Worker function to process incoming bios. */
static void __ebs_process_bios(struct work_struct *ws)
{
	int r;
	bool write = false;
	sector_t block1, block2;
	struct ebs_c *ec = container_of(ws, struct ebs_c, ws);
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irq(&ec->lock);
	bios = ec->bios_in;
	bio_list_init(&ec->bios_in);
	spin_unlock_irq(&ec->lock);

	/* Prefetch all read and any mis-aligned write buffers */
	bio_list_for_each(bio, &bios) {
		block1 = __sector_to_block(ec, bio->bi_iter.bi_sector);
		if (bio_op(bio) == REQ_OP_READ)
			dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio));
		else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) {
			block2 = __sector_to_block(ec, bio_end_sector(bio));
			if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs))
				dm_bufio_prefetch(ec->bufio, block1, 1);
			if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1)
				dm_bufio_prefetch(ec->bufio, block2, 1);
		}
	}

	bio_list_for_each(bio, &bios) {
		r = -EIO;
		if (bio_op(bio) == REQ_OP_READ)
			r = __ebs_rw_bio(ec, READ, bio);
		else if (bio_op(bio) == REQ_OP_WRITE) {
			write = true;
			r = __ebs_rw_bio(ec, WRITE, bio);
		} else if (bio_op(bio) == REQ_OP_DISCARD) {
			__ebs_forget_bio(ec, bio);
			r = __ebs_discard_bio(ec, bio);
		}

		if (r < 0)
			bio->bi_status = errno_to_blk_status(r);
	}

	/*
	 * We write dirty buffers after processing I/O on them
	 * but before we endio thus addressing REQ_FUA/REQ_SYNC.
	 */
	r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0;

	while ((bio = bio_list_pop(&bios))) {
		/* Any other request is endioed. */
		if (unlikely(r && bio_op(bio) == REQ_OP_WRITE))
			bio_io_error(bio);
		else
			bio_endio(bio);
	}
}

/*
 * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>]
 *
 * <dev_path>: path of the underlying device
 * <offset>: offset in 512 bytes sectors into <dev_path>
 * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer
 * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer;
 *	    optional, if not supplied, retrieve logical block size from underlying device
 */
static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	unsigned short tmp1;
	unsigned long long tmp;
	char dummy;
	struct ebs_c *ec;

	if (argc < 3 || argc > 4) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
	if (!ec) {
		ti->error = "Cannot allocate ebs context";
		return -ENOMEM;
	}

	r = -EINVAL;
	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 ||
	    tmp != (sector_t)tmp ||
	    (sector_t)tmp >= ti->len) {
		ti->error = "Invalid device offset sector";
		goto bad;
	}
	ec->start = tmp;

	if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 ||
	    !__ebs_check_bs(tmp1) ||
	    to_bytes(tmp1) > PAGE_SIZE) {
		ti->error = "Invalid emulated block size";
		goto bad;
	}
	ec->e_bs = tmp1;

	if (argc > 3) {
		if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) {
			ti->error = "Invalid underlying block size";
			goto bad;
		}
		ec->u_bs = tmp1;
		ec->u_bs_set = true;
	} else
		ec->u_bs_set = false;

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev);
	if (r) {
		ti->error = "Device lookup failed";
		ec->dev = NULL;
		goto bad;
	}

	r = -EINVAL;
	if (!ec->u_bs_set) {
		ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev));
		if (!__ebs_check_bs(ec->u_bs)) {
			ti->error = "Invalid retrieved underlying block size";
			goto bad;
		}
	}

	if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
		DMINFO("Emulation superfluous: emulated equal to underlying block size");

	if (__block_mod(ec->start, ec->u_bs)) {
		ti->error = "Device offset must be multiple of underlying block size";
		goto bad;
	}

	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL);
	if (IS_ERR(ec->bufio)) {
		ti->error = "Cannot create dm bufio client";
		r = PTR_ERR(ec->bufio);
		ec->bufio = NULL;
		goto bad;
	}

	ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!ec->wq) {
		ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue";
		r = -ENOMEM;
		goto bad;
	}

	ec->block_shift = __ffs(ec->u_bs);
	INIT_WORK(&ec->ws, &__ebs_process_bios);
	bio_list_init(&ec->bios_in);
	spin_lock_init(&ec->lock);

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_secure_erase_bios = 0;
	ti->num_write_same_bios = 0;
	ti->num_write_zeroes_bios = 0;
	return 0;
bad:
	ebs_dtr(ti);
	return r;
}

static void ebs_dtr(struct dm_target *ti)
{
	struct ebs_c *ec = ti->private;

	if (ec->wq)
		destroy_workqueue(ec->wq);
	if (ec->bufio)
		dm_bufio_client_destroy(ec->bufio);
	if (ec->dev)
		dm_put_device(ti, ec->dev);
	kfree(ec);
}

static int ebs_map(struct dm_target *ti, struct bio *bio)
{
	struct ebs_c *ec = ti->private;

	bio_set_dev(bio, ec->dev->bdev);
	bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely(bio->bi_opf & REQ_OP_FLUSH))
		return DM_MAPIO_REMAPPED;
	/*
	 * Only queue for bufio processing in case of partial or overlapping buffers
	 * -or-
	 * emulation with ebs == ubs aiming for tests of dm-bufio overhead.
	 */
	if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) ||
		   __block_mod(bio_end_sector(bio), ec->u_bs) ||
		   ec->e_bs == ec->u_bs)) {
		spin_lock_irq(&ec->lock);
		bio_list_add(&ec->bios_in, bio);
		spin_unlock_irq(&ec->lock);

		queue_work(ec->wq, &ec->ws);

		return DM_MAPIO_SUBMITTED;
	}

	/* Forget any buffer content relative to this direct backing device I/O. */
	__ebs_forget_bio(ec, bio);

	return DM_MAPIO_REMAPPED;
}

static void ebs_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	struct ebs_c *ec = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		*result = '\0';
		break;
	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u",
			 ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs);
		break;
	}
}

static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct ebs_c *ec = ti->private;
	struct dm_dev *dev = ec->dev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	*bdev = dev->bdev;
	return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT);
}

static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct ebs_c *ec = ti->private;

	limits->logical_block_size = to_bytes(ec->e_bs);
	limits->physical_block_size = to_bytes(ec->u_bs);
	limits->alignment_offset = limits->physical_block_size;
	blk_limits_io_min(limits, limits->logical_block_size);
}

static int ebs_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct ebs_c *ec = ti->private;

	return fn(ti, ec->dev, ec->start, ti->len, data);
}

static struct target_type ebs_target = {
	.name		 = "ebs",
	.version	 = {1, 0, 1},
	.features	 = DM_TARGET_PASSES_INTEGRITY,
	.module		 = THIS_MODULE,
	.ctr		 = ebs_ctr,
	.dtr		 = ebs_dtr,
	.map		 = ebs_map,
	.status		 = ebs_status,
	.io_hints	 = ebs_io_hints,
	.prepare_ioctl	 = ebs_prepare_ioctl,
	.iterate_devices = ebs_iterate_devices,
};

static int __init dm_ebs_init(void)
{
	int r = dm_register_target(&ebs_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void dm_ebs_exit(void)
{
	dm_unregister_target(&ebs_target);
}

module_init(dm_ebs_init);
module_exit(dm_ebs_exit);

MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " emulated block size target");
MODULE_LICENSE("GPL");
drivers/md/dm-historical-service-time.c (new file, 561 lines; listing truncated)
@@ -0,0 +1,561 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Historical Service Time
 *
 *  Keeps a time-weighted exponential moving average of the historical
 *  service time. Estimates future service time based on the historical
 *  service time and the number of outstanding requests.
 *
 *  Marks paths stale if they have not finished within hst *
 *  num_paths. If a path is stale and unused, we will send a single
 *  request to probe in case the path has improved. This situation
 *  generally arises if the path is so much worse than others that it
 *  will never have the best estimated service time, or if the entire
 *  multipath device is unused. If a path is stale and in use, limit the
 *  number of requests it can receive with the assumption that the path
 *  has become degraded.
 *
 *  To avoid repeatedly calculating exponents for time weighting, times
 *  are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
 *  ns, and the weighting is pre-calculated.
 *
 */

#include "dm.h"
#include "dm-path-selector.h"

#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/module.h>


#define DM_MSG_PREFIX	"multipath historical-service-time"
#define HST_MIN_IO 1
#define HST_VERSION "0.1.1"

#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
#define HST_FIXED_95 972
#define HST_MAX_INFLIGHT HST_FIXED_1
#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
#define HST_WEIGHT_COUNT 64ULL

struct selector {
	struct list_head valid_paths;
	struct list_head failed_paths;
	int valid_count;
	spinlock_t lock;

	unsigned int weights[HST_WEIGHT_COUNT];
	unsigned int threshold_multiplier;
};

struct path_info {
	struct list_head list;
	struct dm_path *path;
	unsigned int repeat_count;

	spinlock_t lock;

	u64 historical_service_time; /* Fixed point */

	u64 stale_after;
	u64 last_finish;

	u64 outstanding;
};

/**
 * fixed_power - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 *
 * (see: kernel/sched/loadavg.c)
 */
static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return result;
}

/*
 * Calculate the next value of an exponential moving average
 * a_1 = a_0 * e + a * (1 - e)
 *
 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
 * @weight: [0, HST_FIXED_1]
 *
 * Note:
 *   To account for multiple periods in the same calculation,
 *   a_n = a_0 * e^n + a * (1 - e^n),
 *   so call fixed_ema(last, next, pow(weight, N))
 */
static u64 fixed_ema(u64 last, u64 next, u64 weight)
{
	last *= weight;
	last += next * (HST_FIXED_1 - weight);
	last += 1ULL << (HST_FIXED_SHIFT - 1);
	return last >> HST_FIXED_SHIFT;
}

static struct selector *alloc_selector(void)
{
	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (s) {
		INIT_LIST_HEAD(&s->valid_paths);
		INIT_LIST_HEAD(&s->failed_paths);
		spin_lock_init(&s->lock);
		s->valid_count = 0;
	}

	return s;
}

/*
 * Get the weight for a given time span.
 */
static u64 hst_weight(struct path_selector *ps, u64 delta)
{
	struct selector *s = ps->context;
	int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
			   HST_WEIGHT_COUNT - 1);

	return s->weights[bucket];
}

/*
 * Set up the weights array.
 *
 * weights[len-1] = 0
 * weights[n] = base ^ (n + 1)
 */
static void hst_set_weights(struct path_selector *ps, unsigned int base)
{
	struct selector *s = ps->context;
	int i;

	if (base >= HST_FIXED_1)
		return;

	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
	s->weights[HST_WEIGHT_COUNT - 1] = 0;
}

static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
{
	struct selector *s;
	unsigned int base_weight = HST_FIXED_95;
	unsigned int threshold_multiplier = 0;
	char dummy;

	/*
	 * Arguments: [<base_weight> [<threshold_multiplier>]]
	 *	<base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
	 *		       value of 0 will completely ignore any history.
	 *		       If not given, default (HST_FIXED_95) is used.
	 *	<threshold_multiplier>: Minimum threshold multiplier for paths to
	 *				be considered different. That is, a path is
	 *				considered different iff (p1 > N * p2) where p1
	 *				is the path with higher service time. A threshold
	 *				of 1 or 0 has no effect. Defaults to 0.
	 */
	if (argc > 2)
		return -EINVAL;

	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
		     base_weight >= HST_FIXED_1)) {
		return -EINVAL;
	}

	if (argc > 1 && (sscanf(argv[1], "%u%c",
				&threshold_multiplier, &dummy) != 1)) {
		return -EINVAL;
	}

	s = alloc_selector();
	if (!s)
		return -ENOMEM;

	ps->context = s;

	hst_set_weights(ps, base_weight);
	s->threshold_multiplier = threshold_multiplier;
	return 0;
}

static void free_paths(struct list_head *paths)
{
	struct path_info *pi, *next;

	list_for_each_entry_safe(pi, next, paths, list) {
		list_del(&pi->list);
		kfree(pi);
	}
}

static void hst_destroy(struct path_selector *ps)
{
	struct selector *s = ps->context;

	free_paths(&s->valid_paths);
	free_paths(&s->failed_paths);
	kfree(s);
	ps->context = NULL;
}

static int hst_status(struct path_selector *ps, struct dm_path *path,
		     status_type_t type, char *result, unsigned int maxlen)
{
	unsigned int sz = 0;
	struct path_info *pi;

	if (!path) {
		struct selector *s = ps->context;

		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
	} else {
		pi = path->pscontext;

		switch (type) {
		case STATUSTYPE_INFO:
			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
			       pi->outstanding, pi->stale_after);
			break;
		case STATUSTYPE_TABLE:
			DMEMIT("0 ");
			break;
		}
	}

	return sz;
}

static int hst_add_path(struct path_selector *ps, struct dm_path *path,
			int argc, char **argv, char **error)
{
	struct selector *s = ps->context;
	struct path_info *pi;
	unsigned int repeat_count = HST_MIN_IO;
	char dummy;
	unsigned long flags;

	/*
	 * Arguments: [<repeat_count>]
	 *   <repeat_count>: The number of I/Os before switching path.
	 *		     If not given, default (HST_MIN_IO) is used.
	 */
	if (argc > 1) {
		*error = "historical-service-time ps: incorrect number of arguments";
		return -EINVAL;
	}

	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "historical-service-time ps: invalid repeat count";
		return -EINVAL;
	}

	/* allocate the path */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "historical-service-time ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;

	pi->historical_service_time = HST_FIXED_1;

	spin_lock_init(&pi->lock);
	pi->outstanding = 0;

	pi->stale_after = 0;
	pi->last_finish = 0;

	path->pscontext = pi;

	spin_lock_irqsave(&s->lock, flags);
	list_add_tail(&pi->list, &s->valid_paths);
	s->valid_count++;
	spin_unlock_irqrestore(&s->lock, flags);

	return 0;
}

static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
{
	struct selector *s = ps->context;
	struct path_info *pi = path->pscontext;
	unsigned long flags;

	spin_lock_irqsave(&s->lock, flags);
	list_move(&pi->list, &s->failed_paths);
	s->valid_count--;
	spin_unlock_irqrestore(&s->lock, flags);
}

static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
{
	struct selector *s = ps->context;
	struct path_info *pi = path->pscontext;
	unsigned long flags;

	spin_lock_irqsave(&s->lock, flags);
	list_move_tail(&pi->list, &s->valid_paths);
	s->valid_count++;
	spin_unlock_irqrestore(&s->lock, flags);

	return 0;
}

static void hst_fill_compare(struct path_info *pi, u64 *hst,
			     u64 *out, u64 *stale)
{
	unsigned long flags;

	spin_lock_irqsave(&pi->lock, flags);
	*hst = pi->historical_service_time;
	*out = pi->outstanding;
	*stale = pi->stale_after;
	spin_unlock_irqrestore(&pi->lock, flags);
}

/*
 * Compare the estimated service time of 2 paths, pi1 and pi2,
|
* for the incoming I/O.
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* < 0 : pi1 is better
|
||||||
|
* 0 : no difference between pi1 and pi2
|
||||||
|
* > 0 : pi2 is better
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
|
||||||
|
u64 time_now, struct path_selector *ps)
|
||||||
|
{
|
||||||
|
struct selector *s = ps->context;
|
||||||
|
u64 hst1, hst2;
|
||||||
|
long long out1, out2, stale1, stale2;
|
||||||
|
int pi2_better, over_threshold;
|
||||||
|
|
||||||
|
hst_fill_compare(pi1, &hst1, &out1, &stale1);
|
||||||
|
hst_fill_compare(pi2, &hst2, &out2, &stale2);
|
||||||
|
|
||||||
|
/* Check here if estimated latency for two paths are too similar.
|
||||||
|
* If this is the case, we skip extra calculation and just compare
|
||||||
|
* outstanding requests. In this case, any unloaded paths will
|
||||||
|
* be preferred.
|
||||||
|
*/
|
||||||
|
if (hst1 > hst2)
|
||||||
|
over_threshold = hst1 > (s->threshold_multiplier * hst2);
|
||||||
|
else
|
||||||
|
over_threshold = hst2 > (s->threshold_multiplier * hst1);
|
||||||
|
|
||||||
|
if (!over_threshold)
|
||||||
|
return out1 - out2;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If an unloaded path is stale, choose it. If both paths are unloaded,
|
||||||
|
* choose path that is the most stale.
|
||||||
|
* (If one path is loaded, choose the other)
|
||||||
|
*/
|
||||||
|
if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
|
||||||
|
(!out1 && !out2))
|
||||||
|
return (!out2 * stale1) - (!out1 * stale2);
|
||||||
|
|
||||||
|
/* Compare estimated service time. If outstanding is the same, we
|
||||||
|
* don't need to multiply
|
||||||
|
*/
|
||||||
|
if (out1 == out2) {
|
||||||
|
pi2_better = hst1 > hst2;
|
||||||
|
} else {
|
||||||
|
/* Potential overflow with out >= 1024 */
|
||||||
|
if (unlikely(out1 >= HST_MAX_INFLIGHT ||
|
||||||
|
out2 >= HST_MAX_INFLIGHT)) {
|
||||||
|
/* If over 1023 in-flights, we may overflow if hst
|
||||||
|
* is at max. (With this shift we still overflow at
|
||||||
|
* 1048576 in-flights, which is high enough).
|
||||||
|
*/
|
||||||
|
hst1 >>= HST_FIXED_SHIFT;
|
||||||
|
hst2 >>= HST_FIXED_SHIFT;
|
||||||
|
}
|
||||||
|
pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* In the case that the 'winner' is stale, limit to equal usage. */
|
||||||
|
if (pi2_better) {
|
||||||
|
if (stale2 < time_now)
|
||||||
|
return out1 - out2;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (stale1 < time_now)
|
||||||
|
return out1 - out2;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct dm_path *hst_select_path(struct path_selector *ps,
|
||||||
|
size_t nr_bytes)
|
||||||
|
{
|
||||||
|
struct selector *s = ps->context;
|
||||||
|
struct path_info *pi = NULL, *best = NULL;
|
||||||
|
u64 time_now = sched_clock();
|
||||||
|
struct dm_path *ret = NULL;
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
spin_lock_irqsave(&s->lock, flags);
|
||||||
|
if (list_empty(&s->valid_paths))
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
list_for_each_entry(pi, &s->valid_paths, list) {
|
||||||
|
if (!best || (hst_compare(pi, best, time_now, ps) < 0))
|
||||||
|
best = pi;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!best)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
/* Move last used path to end (least preferred in case of ties) */
|
||||||
|
list_move_tail(&best->list, &s->valid_paths);
|
||||||
|
|
||||||
|
ret = best->path;
|
||||||
|
|
||||||
|
out:
|
||||||
|
spin_unlock_irqrestore(&s->lock, flags);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int hst_start_io(struct path_selector *ps, struct dm_path *path,
|
||||||
|
size_t nr_bytes)
|
||||||
|
{
|
||||||
|
struct path_info *pi = path->pscontext;
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
spin_lock_irqsave(&pi->lock, flags);
|
||||||
|
pi->outstanding++;
|
||||||
|
spin_unlock_irqrestore(&pi->lock, flags);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static u64 path_service_time(struct path_info *pi, u64 start_time)
|
||||||
|
{
|
||||||
|
u64 sched_now = ktime_get_ns();
|
||||||
|
|
||||||
|
/* if a previous disk request has finished after this IO was
|
||||||
|
* sent to the hardware, pretend the submission happened
|
||||||
|
* serially.
|
||||||
|
*/
|
||||||
|
if (time_after64(pi->last_finish, start_time))
|
||||||
|
start_time = pi->last_finish;
|
||||||
|
|
||||||
|
pi->last_finish = sched_now;
|
||||||
|
if (time_before64(sched_now, start_time))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return sched_now - start_time;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int hst_end_io(struct path_selector *ps, struct dm_path *path,
|
||||||
|
size_t nr_bytes, u64 start_time)
|
||||||
|
{
|
||||||
|
struct path_info *pi = path->pscontext;
|
||||||
|
struct selector *s = ps->context;
|
||||||
|
unsigned long flags;
|
||||||
|
u64 st;
|
||||||
|
|
||||||
|
spin_lock_irqsave(&pi->lock, flags);
|
||||||
|
|
||||||
|
st = path_service_time(pi, start_time);
|
||||||
|
pi->outstanding--;
|
||||||
|
pi->historical_service_time =
|
||||||
|
fixed_ema(pi->historical_service_time,
|
||||||
|
min(st * HST_FIXED_1, HST_FIXED_MAX),
|
||||||
|
hst_weight(ps, st));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* On request end, mark path as fresh. If a path hasn't
|
||||||
|
* finished any requests within the fresh period, the estimated
|
||||||
|
* service time is considered too optimistic and we limit the
|
||||||
|
* maximum requests on that path.
|
||||||
|
*/
|
||||||
|
pi->stale_after = pi->last_finish +
|
||||||
|
(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
|
||||||
|
|
||||||
|
spin_unlock_irqrestore(&pi->lock, flags);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct path_selector_type hst_ps = {
|
||||||
|
.name = "historical-service-time",
|
||||||
|
.module = THIS_MODULE,
|
||||||
|
.table_args = 1,
|
||||||
|
.info_args = 3,
|
||||||
|
.create = hst_create,
|
||||||
|
.destroy = hst_destroy,
|
||||||
|
.status = hst_status,
|
||||||
|
.add_path = hst_add_path,
|
||||||
|
.fail_path = hst_fail_path,
|
||||||
|
.reinstate_path = hst_reinstate_path,
|
||||||
|
.select_path = hst_select_path,
|
||||||
|
.start_io = hst_start_io,
|
||||||
|
.end_io = hst_end_io,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init dm_hst_init(void)
|
||||||
|
{
|
||||||
|
int r = dm_register_path_selector(&hst_ps);
|
||||||
|
|
||||||
|
if (r < 0)
|
||||||
|
DMERR("register failed %d", r);
|
||||||
|
|
||||||
|
DMINFO("version " HST_VERSION " loaded");
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit dm_hst_exit(void)
|
||||||
|
{
|
||||||
|
int r = dm_unregister_path_selector(&hst_ps);
|
||||||
|
|
||||||
|
if (r < 0)
|
||||||
|
DMERR("unregister failed %d", r);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(dm_hst_init);
|
||||||
|
module_exit(dm_hst_exit);
|
||||||
|
|
||||||
|
MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
|
||||||
|
MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
|
||||||
|
MODULE_LICENSE("GPL");
|
@@ -92,7 +92,7 @@ struct journal_entry {
 		} s;
 		__u64 sector;
 	} u;
-	commit_id_t last_bytes[0];
+	commit_id_t last_bytes[];
 	/* __u8 tag[0]; */
 };

@@ -1553,8 +1553,6 @@ static void integrity_metadata(struct work_struct *w)
 		char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 		sector_t sector;
 		unsigned sectors_to_process;
-		sector_t save_metadata_block;
-		unsigned save_metadata_offset;

 		if (unlikely(ic->mode == 'R'))
 			goto skip_io;
@@ -1605,8 +1603,6 @@ static void integrity_metadata(struct work_struct *w)
 			goto skip_io;
 		}

-		save_metadata_block = dio->metadata_block;
-		save_metadata_offset = dio->metadata_offset;
 		sector = dio->range.logical_sector;
 		sectors_to_process = dio->range.n_sectors;

@@ -127,7 +127,7 @@ struct pending_block {
 	char *data;
 	u32 datalen;
 	struct list_head list;
-	struct bio_vec vecs[0];
+	struct bio_vec vecs[];
 };

 struct per_bio_data {
@@ -439,7 +439,7 @@ failed:
 }

 /*
- * dm_report_EIO() is a macro instead of a function to make pr_debug()
+ * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
  * report the function name and line number of the function from which
  * it has been invoked.
  */
@@ -447,43 +447,25 @@ failed:
 do { \
 	struct mapped_device *md = dm_table_get_md((m)->ti->table); \
 	\
-	pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
+	DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
		 dm_device_name(md), \
		 test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
		 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
		 dm_noflush_suspending((m)->ti)); \
 } while (0)

 /*
  * Check whether bios must be queued in the device-mapper core rather
  * than here in the target.
- *
- * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold
- * the same value then we are not between multipath_presuspend()
- * and multipath_resume() calls and we have no need to check
- * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static bool __must_push_back(struct multipath *m, unsigned long flags)
+static bool __must_push_back(struct multipath *m)
 {
-	return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) !=
-		 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) &&
-		dm_noflush_suspending(m->ti));
+	return dm_noflush_suspending(m->ti);
 }

-/*
- * Following functions use READ_ONCE to get atomic access to
- * all m->flags to avoid taking spinlock
- */
 static bool must_push_back_rq(struct multipath *m)
 {
-	unsigned long flags = READ_ONCE(m->flags);
-
-	return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags);
-}
-
-static bool must_push_back_bio(struct multipath *m)
-{
-	unsigned long flags = READ_ONCE(m->flags);
-
-	return __must_push_back(m, flags);
+	return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m);
 }

 /*
@@ -567,7 +549,8 @@ static void multipath_release_clone(struct request *clone,
 		if (pgpath && pgpath->pg->ps.type->end_io)
 			pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
 						    &pgpath->path,
-						    mpio->nr_bytes);
+						    mpio->nr_bytes,
+						    clone->io_start_time_ns);
 	}

 	blk_put_request(clone);
@@ -619,7 +602,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio,
 		return DM_MAPIO_SUBMITTED;

 	if (!pgpath) {
-		if (must_push_back_bio(m))
+		if (__must_push_back(m))
 			return DM_MAPIO_REQUEUE;
 		dm_report_EIO(m);
 		return DM_MAPIO_KILL;
@@ -709,15 +692,38 @@ static void process_queued_bios(struct work_struct *work)
  * If we run out of usable paths, should we queue I/O or error it?
  */
 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
-			    bool save_old_value)
+			    bool save_old_value, const char *caller)
 {
 	unsigned long flags;
+	bool queue_if_no_path_bit, saved_queue_if_no_path_bit;
+	const char *dm_dev_name = dm_device_name(dm_table_get_md(m->ti->table));
+
+	DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d",
+		dm_dev_name, __func__, caller, queue_if_no_path, save_old_value);

 	spin_lock_irqsave(&m->lock, flags);
-	assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags,
-		   (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
-		   (!save_old_value && queue_if_no_path));
+
+	queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+	saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+
+	if (save_old_value) {
+		if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) {
+			DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!",
+			      dm_dev_name);
+		} else
+			assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit);
+	} else if (!queue_if_no_path && saved_queue_if_no_path_bit) {
+		/* due to "fail_if_no_path" message, need to honor it. */
+		clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+	}
 	assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
+
+	DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d",
+		dm_dev_name, __func__,
+		test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
+		test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
+		dm_noflush_suspending(m->ti));
+
 	spin_unlock_irqrestore(&m->lock, flags);

 	if (!queue_if_no_path) {
@@ -738,7 +744,7 @@ static void queue_if_no_path_timeout_work(struct timer_list *t)
 	struct mapped_device *md = dm_table_get_md(m->ti->table);

 	DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
-	queue_if_no_path(m, false, false);
+	queue_if_no_path(m, false, false, __func__);
 }

 /*
@@ -1078,7 +1084,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 		argc--;

 		if (!strcasecmp(arg_name, "queue_if_no_path")) {
-			r = queue_if_no_path(m, true, false);
+			r = queue_if_no_path(m, true, false, __func__);
 			continue;
 		}

@@ -1279,7 +1285,9 @@ static int fail_path(struct pgpath *pgpath)
 	if (!pgpath->is_active)
 		goto out;

-	DMWARN("Failing path %s.", pgpath->path.dev->name);
+	DMWARN("%s: Failing path %s.",
+	       dm_device_name(dm_table_get_md(m->ti->table)),
+	       pgpath->path.dev->name);

 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
 	pgpath->is_active = false;
@@ -1318,7 +1326,9 @@ static int reinstate_path(struct pgpath *pgpath)
 	if (pgpath->is_active)
 		goto out;

-	DMWARN("Reinstating path %s.", pgpath->path.dev->name);
+	DMWARN("%s: Reinstating path %s.",
+	       dm_device_name(dm_table_get_md(m->ti->table)),
+	       pgpath->path.dev->name);

 	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
 	if (r)
@@ -1617,7 +1627,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 		struct path_selector *ps = &pgpath->pg->ps;

 		if (ps->type->end_io)
-			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
+					 clone->io_start_time_ns);
 	}

 	return r;
@@ -1640,7 +1651,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,

 	if (atomic_read(&m->nr_valid_paths) == 0 &&
	    !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-		if (must_push_back_bio(m)) {
+		if (__must_push_back(m)) {
 			r = DM_ENDIO_REQUEUE;
 		} else {
 			dm_report_EIO(m);
@@ -1661,23 +1672,27 @@ done:
 		struct path_selector *ps = &pgpath->pg->ps;

 		if (ps->type->end_io)
-			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
+					 dm_start_time_ns_from_clone(clone));
 	}

 	return r;
 }

 /*
- * Suspend can't complete until all the I/O is processed so if
- * the last path fails we must error any remaining I/O.
- * Note that if the freeze_bdev fails while suspending, the
+ * Suspend with flush can't complete until all the I/O is processed
+ * so if the last path fails we must error any remaining I/O.
+ * - Note that if the freeze_bdev fails while suspending, the
  * queue_if_no_path state is lost - userspace should reset it.
+ * Otherwise, during noflush suspend, queue_if_no_path will not change.
  */
 static void multipath_presuspend(struct dm_target *ti)
 {
 	struct multipath *m = ti->private;

-	queue_if_no_path(m, false, true);
+	/* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
+	if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
+		queue_if_no_path(m, false, true, __func__);
 }

 static void multipath_postsuspend(struct dm_target *ti)
@@ -1698,8 +1713,16 @@ static void multipath_resume(struct dm_target *ti)
 	unsigned long flags;

 	spin_lock_irqsave(&m->lock, flags);
-	assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags,
-		   test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
+	if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
+		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+		clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+	}
+
+	DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d",
+		dm_device_name(dm_table_get_md(m->ti->table)), __func__,
+		test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
+		test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
+
 	spin_unlock_irqrestore(&m->lock, flags);
 }

@@ -1859,13 +1882,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,

 	if (argc == 1) {
 		if (!strcasecmp(argv[0], "queue_if_no_path")) {
-			r = queue_if_no_path(m, true, false);
+			r = queue_if_no_path(m, true, false, __func__);
 			spin_lock_irqsave(&m->lock, flags);
 			enable_nopath_timeout(m);
 			spin_unlock_irqrestore(&m->lock, flags);
 			goto out;
 		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
-			r = queue_if_no_path(m, false, false);
+			r = queue_if_no_path(m, false, false, __func__);
 			disable_nopath_timeout(m);
 			goto out;
 		}
@@ -1918,7 +1941,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 	int r;

 	current_pgpath = READ_ONCE(m->current_pgpath);
-	if (!current_pgpath)
+	if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
 		current_pgpath = choose_pgpath(m, 0);

 	if (current_pgpath) {
@@ -74,7 +74,7 @@ struct path_selector_type {
 	int (*start_io) (struct path_selector *ps, struct dm_path *path,
			 size_t nr_bytes);
 	int (*end_io) (struct path_selector *ps, struct dm_path *path,
-		       size_t nr_bytes);
+		       size_t nr_bytes, u64 start_time);
 };

 /* Register a path selector */
@@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct dm_path *path,
 }

 static int ql_end_io(struct path_selector *ps, struct dm_path *path,
-		     size_t nr_bytes)
+		     size_t nr_bytes, u64 start_time)
 {
 	struct path_info *pi = path->pscontext;

@@ -254,7 +254,7 @@ struct raid_set {
 		int mode;
 	} journal_dev;

-	struct raid_dev dev[0];
+	struct raid_dev dev[];
 };

 static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
@@ -83,7 +83,7 @@ struct mirror_set {
 	struct work_struct trigger_event;

 	unsigned nr_mirrors;
-	struct mirror mirror[0];
+	struct mirror mirror[];
 };

 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
@@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct dm_path *path,
 }

 static int st_end_io(struct path_selector *ps, struct dm_path *path,
-		     size_t nr_bytes)
+		     size_t nr_bytes, u64 start_time)
 {
 	struct path_info *pi = path->pscontext;

@@ -56,7 +56,7 @@ struct dm_stat {
 	size_t percpu_alloc_size;
 	size_t histogram_alloc_size;
 	struct dm_stat_percpu *stat_percpu[NR_CPUS];
-	struct dm_stat_shared stat_shared[0];
+	struct dm_stat_shared stat_shared[];
 };

 #define STAT_PRECISE_TIMESTAMPS 1
@@ -41,7 +41,7 @@ struct stripe_c {
 	/* Work struct used for triggering events*/
 	struct work_struct trigger_event;

-	struct stripe stripe[0];
+	struct stripe stripe[];
 };

 /*
@@ -53,7 +53,7 @@ struct switch_ctx {
 	/*
	 * Array of dm devices to switch between.
	 */
-	struct switch_path path_list[0];
+	struct switch_path path_list[];
 };

 static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
@@ -234,10 +234,6 @@ static int persistent_memory_claim(struct dm_writecache *wc)

 	wc->memory_vmapped = false;

-	if (!wc->ssd_dev->dax_dev) {
-		r = -EOPNOTSUPP;
-		goto err1;
-	}
 	s = wc->memory_map_size;
 	p = s >> PAGE_SHIFT;
 	if (!p) {
@@ -1143,6 +1139,42 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 	return r;
 }

+static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
+{
+	/*
+	 * clflushopt performs better with block size 1024, 2048, 4096
+	 * non-temporal stores perform better with block size 512
+	 *
+	 * block size	512	1024	2048	4096
+	 * movnti	496 MB/s  642 MB/s  725 MB/s  744 MB/s
+	 * clflushopt	373 MB/s  688 MB/s  1.1 GB/s  1.2 GB/s
+	 *
+	 * We see that movnti performs better for 512-byte blocks, and
+	 * clflushopt performs better for 1024-byte and larger blocks. So, we
+	 * prefer clflushopt for sizes >= 768.
+	 *
+	 * NOTE: this happens to be the case now (with dm-writecache's single
+	 * threaded model) but re-evaluate this once memcpy_flushcache() is
+	 * enabled to use movdir64b which might invalidate this performance
+	 * advantage seen with cache-allocating-writes plus flushing.
+	 */
+#ifdef CONFIG_X86
+	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+	    likely(boot_cpu_data.x86_clflush_size == 64) &&
+	    likely(size >= 768)) {
+		do {
+			memcpy((void *)dest, (void *)source, 64);
+			clflushopt((void *)dest);
+			dest += 64;
+			source += 64;
+			size -= 64;
+		} while (size >= 64);
+		return;
+	}
+#endif
+	memcpy_flushcache(dest, source, size);
+}
+
 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
 {
 	void *buf;
@@ -1168,7 +1200,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data
 		}
 	} else {
 		flush_dcache_page(bio_page(bio));
-		memcpy_flushcache(data, buf, size);
+		memcpy_flushcache_optimized(data, buf, size);
 	}

 	bvec_kunmap_irq(buf, &flags);
File diff suppressed because it is too large
@ -13,7 +13,6 @@
|
|||||||
|
|
||||||
struct dmz_reclaim {
|
struct dmz_reclaim {
|
||||||
struct dmz_metadata *metadata;
|
struct dmz_metadata *metadata;
|
||||||
struct dmz_dev *dev;
|
|
||||||
|
|
||||||
struct delayed_work work;
|
struct delayed_work work;
|
||||||
struct workqueue_struct *wq;
|
struct workqueue_struct *wq;
|
||||||
@ -22,6 +21,8 @@ struct dmz_reclaim {
|
|||||||
struct dm_kcopyd_throttle kc_throttle;
|
struct dm_kcopyd_throttle kc_throttle;
|
||||||
int kc_err;
|
int kc_err;
|
||||||
|
|
||||||
|
int dev_idx;
|
||||||
|
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
|
||||||
/* Last target access time */
|
/* Last target access time */
|
||||||
@ -44,13 +45,13 @@ enum {
|
|||||||
* Percentage of unmapped (free) random zones below which reclaim starts
|
* Percentage of unmapped (free) random zones below which reclaim starts
|
||||||
* even if the target is busy.
|
* even if the target is busy.
|
||||||
*/
|
*/
|
||||||
#define DMZ_RECLAIM_LOW_UNMAP_RND 30
|
#define DMZ_RECLAIM_LOW_UNMAP_ZONES 30
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Percentage of unmapped (free) random zones above which reclaim will
|
* Percentage of unmapped (free) random zones above which reclaim will
|
||||||
* stop if the target is busy.
|
* stop if the target is busy.
|
||||||
*/
|
*/
|
||||||
#define DMZ_RECLAIM_HIGH_UNMAP_RND 50
|
#define DMZ_RECLAIM_HIGH_UNMAP_ZONES 50
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Align a sequential zone write pointer to chunk_block.
|
* Align a sequential zone write pointer to chunk_block.
|
||||||
@ -59,6 +60,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
|
|||||||
sector_t block)
|
sector_t block)
|
||||||
{
|
{
|
||||||
struct dmz_metadata *zmd = zrc->metadata;
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
|
struct dmz_dev *dev = zone->dev;
|
||||||
sector_t wp_block = zone->wp_block;
|
sector_t wp_block = zone->wp_block;
|
||||||
unsigned int nr_blocks;
|
unsigned int nr_blocks;
|
||||||
int ret;
|
int ret;
|
||||||
@ -74,15 +76,15 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
|
|||||||
* pointer and the requested position.
|
* pointer and the requested position.
|
||||||
*/
|
*/
|
||||||
nr_blocks = block - wp_block;
|
nr_blocks = block - wp_block;
|
||||||
ret = blkdev_issue_zeroout(zrc->dev->bdev,
|
ret = blkdev_issue_zeroout(dev->bdev,
|
||||||
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
|
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
|
||||||
dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
|
dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
dmz_dev_err(zrc->dev,
|
dmz_dev_err(dev,
|
||||||
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
|
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
|
||||||
dmz_id(zmd, zone), (unsigned long long)wp_block,
|
zone->id, (unsigned long long)wp_block,
|
||||||
(unsigned long long)block, nr_blocks, ret);
|
(unsigned long long)block, nr_blocks, ret);
|
||||||
dmz_check_bdev(zrc->dev);
|
dmz_check_bdev(dev);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,7 +118,6 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
|
|||||||
struct dm_zone *src_zone, struct dm_zone *dst_zone)
|
struct dm_zone *src_zone, struct dm_zone *dst_zone)
|
||||||
{
|
{
|
||||||
struct dmz_metadata *zmd = zrc->metadata;
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
struct dmz_dev *dev = zrc->dev;
|
|
||||||
struct dm_io_region src, dst;
|
struct dm_io_region src, dst;
|
||||||
sector_t block = 0, end_block;
|
sector_t block = 0, end_block;
|
||||||
sector_t nr_blocks;
|
sector_t nr_blocks;
|
||||||
@ -128,7 +129,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
|
|||||||
if (dmz_is_seq(src_zone))
|
if (dmz_is_seq(src_zone))
|
||||||
end_block = src_zone->wp_block;
|
end_block = src_zone->wp_block;
|
||||||
else
|
else
|
||||||
end_block = dev->zone_nr_blocks;
|
end_block = dmz_zone_nr_blocks(zmd);
|
||||||
src_zone_block = dmz_start_block(zmd, src_zone);
|
src_zone_block = dmz_start_block(zmd, src_zone);
|
||||||
dst_zone_block = dmz_start_block(zmd, dst_zone);
|
dst_zone_block = dmz_start_block(zmd, dst_zone);
|
||||||
|
|
||||||
@ -136,8 +137,13 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
|
|||||||
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
|
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
|
||||||
|
|
||||||
while (block < end_block) {
|
while (block < end_block) {
|
||||||
if (dev->flags & DMZ_BDEV_DYING)
|
if (src_zone->dev->flags & DMZ_BDEV_DYING)
|
||||||
return -EIO;
|
return -EIO;
|
||||||
|
if (dst_zone->dev->flags & DMZ_BDEV_DYING)
|
||||||
|
return -EIO;
|
||||||
|
|
||||||
|
if (dmz_reclaim_should_terminate(src_zone))
|
||||||
|
return -EINTR;
|
||||||
|
|
||||||
/* Get a valid region from the source zone */
|
/* Get a valid region from the source zone */
|
||||||
ret = dmz_first_valid_block(zmd, src_zone, &block);
|
ret = dmz_first_valid_block(zmd, src_zone, &block);
|
||||||
@ -156,11 +162,11 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
src.bdev = dev->bdev;
|
src.bdev = src_zone->dev->bdev;
|
||||||
src.sector = dmz_blk2sect(src_zone_block + block);
|
src.sector = dmz_blk2sect(src_zone_block + block);
|
||||||
src.count = dmz_blk2sect(nr_blocks);
|
src.count = dmz_blk2sect(nr_blocks);
|
||||||
|
|
||||||
dst.bdev = dev->bdev;
|
dst.bdev = dst_zone->dev->bdev;
|
||||||
dst.sector = dmz_blk2sect(dst_zone_block + block);
|
dst.sector = dmz_blk2sect(dst_zone_block + block);
|
||||||
dst.count = src.count;
|
dst.count = src.count;
|
||||||
|
|
||||||
@ -194,10 +200,10 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
struct dmz_metadata *zmd = zrc->metadata;
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
dmz_dev_debug(zrc->dev,
|
DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
|
||||||
"Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
|
dmz_metadata_label(zmd), zrc->dev_idx,
|
||||||
dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
|
dzone->chunk, bzone->id, dmz_weight(bzone),
|
||||||
dmz_id(zmd, dzone), dmz_weight(dzone));
|
dzone->id, dmz_weight(dzone));
|
||||||
|
|
||||||
/* Flush data zone into the buffer zone */
|
/* Flush data zone into the buffer zone */
|
||||||
ret = dmz_reclaim_copy(zrc, bzone, dzone);
|
ret = dmz_reclaim_copy(zrc, bzone, dzone);
|
||||||
@ -210,7 +216,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
|
ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
/* Free the buffer zone */
|
/* Free the buffer zone */
|
||||||
dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
|
dmz_invalidate_blocks(zmd, bzone, 0, dmz_zone_nr_blocks(zmd));
|
||||||
dmz_lock_map(zmd);
|
dmz_lock_map(zmd);
|
||||||
dmz_unmap_zone(zmd, bzone);
|
dmz_unmap_zone(zmd, bzone);
|
||||||
dmz_unlock_zone_reclaim(dzone);
|
dmz_unlock_zone_reclaim(dzone);
|
||||||
@ -233,10 +239,10 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
struct dmz_metadata *zmd = zrc->metadata;
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
dmz_dev_debug(zrc->dev,
|
DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
|
||||||
"Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
|
dmz_metadata_label(zmd), zrc->dev_idx,
|
||||||
chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
|
chunk, dzone->id, dmz_weight(dzone),
|
||||||
dmz_id(zmd, bzone), dmz_weight(bzone));
|
bzone->id, dmz_weight(bzone));
|
||||||
|
|
||||||
/* Flush data zone into the buffer zone */
|
/* Flush data zone into the buffer zone */
|
||||||
ret = dmz_reclaim_copy(zrc, dzone, bzone);
|
ret = dmz_reclaim_copy(zrc, dzone, bzone);
|
||||||
@ -252,7 +258,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
* Free the data zone and remap the chunk to
|
* Free the data zone and remap the chunk to
|
||||||
* the buffer zone.
|
* the buffer zone.
|
||||||
*/
|
*/
|
||||||
dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
|
dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd));
|
||||||
dmz_lock_map(zmd);
|
dmz_lock_map(zmd);
|
||||||
dmz_unmap_zone(zmd, bzone);
|
dmz_unmap_zone(zmd, bzone);
|
||||||
dmz_unmap_zone(zmd, dzone);
|
dmz_unmap_zone(zmd, dzone);
|
||||||
@ -277,18 +283,26 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
struct dm_zone *szone = NULL;
|
struct dm_zone *szone = NULL;
|
||||||
struct dmz_metadata *zmd = zrc->metadata;
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
int ret;
|
int ret;
|
||||||
|
int alloc_flags = DMZ_ALLOC_SEQ;
|
||||||
|
|
||||||
/* Get a free sequential zone */
|
/* Get a free random or sequential zone */
|
||||||
dmz_lock_map(zmd);
|
dmz_lock_map(zmd);
|
||||||
szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
|
again:
|
||||||
|
szone = dmz_alloc_zone(zmd, zrc->dev_idx,
|
||||||
|
alloc_flags | DMZ_ALLOC_RECLAIM);
|
||||||
|
if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) {
|
||||||
|
alloc_flags = DMZ_ALLOC_RND;
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
dmz_unlock_map(zmd);
|
dmz_unlock_map(zmd);
|
||||||
if (!szone)
|
if (!szone)
|
||||||
return -ENOSPC;
|
return -ENOSPC;
|
||||||
|
|
||||||
dmz_dev_debug(zrc->dev,
|
DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u",
|
||||||
"Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
|
dmz_metadata_label(zmd), zrc->dev_idx, chunk,
|
||||||
chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
|
dmz_is_cache(dzone) ? "cache" : "rnd",
|
||||||
dmz_id(zmd, szone));
|
dzone->id, dmz_weight(dzone),
|
||||||
|
dmz_is_rnd(szone) ? "rnd" : "seq", szone->id);
|
||||||
|
|
||||||
/* Flush the random data zone into the sequential zone */
|
/* Flush the random data zone into the sequential zone */
|
||||||
ret = dmz_reclaim_copy(zrc, dzone, szone);
|
ret = dmz_reclaim_copy(zrc, dzone, szone);
|
||||||
@ -306,7 +320,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
dmz_unlock_map(zmd);
|
dmz_unlock_map(zmd);
|
||||||
} else {
|
} else {
|
||||||
/* Free the data zone and remap the chunk */
|
/* Free the data zone and remap the chunk */
|
||||||
dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
|
dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd));
|
||||||
dmz_lock_map(zmd);
|
dmz_lock_map(zmd);
|
||||||
dmz_unmap_zone(zmd, dzone);
|
dmz_unmap_zone(zmd, dzone);
|
||||||
dmz_unlock_zone_reclaim(dzone);
|
dmz_unlock_zone_reclaim(dzone);
|
||||||
@ -336,6 +350,14 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
|
|||||||
dmz_unlock_flush(zmd);
|
dmz_unlock_flush(zmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test if the target device is idle.
|
||||||
|
*/
|
||||||
|
static inline int dmz_target_idle(struct dmz_reclaim *zrc)
|
||||||
|
{
|
||||||
|
return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find a candidate zone for reclaim and process it.
|
* Find a candidate zone for reclaim and process it.
|
||||||
*/
|
*/
|
||||||
@ -348,13 +370,16 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
|
|||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
/* Get a data zone */
|
/* Get a data zone */
|
||||||
dzone = dmz_get_zone_for_reclaim(zmd);
|
dzone = dmz_get_zone_for_reclaim(zmd, zrc->dev_idx,
|
||||||
if (IS_ERR(dzone))
|
dmz_target_idle(zrc));
|
||||||
return PTR_ERR(dzone);
|
if (!dzone) {
|
||||||
|
DMDEBUG("(%s/%u): No zone found to reclaim",
|
||||||
|
dmz_metadata_label(zmd), zrc->dev_idx);
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
|
||||||
start = jiffies;
|
start = jiffies;
|
||||||
|
if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) {
|
||||||
if (dmz_is_rnd(dzone)) {
|
|
||||||
if (!dmz_weight(dzone)) {
|
if (!dmz_weight(dzone)) {
|
||||||
/* Empty zone */
|
/* Empty zone */
|
||||||
dmz_reclaim_empty(zrc, dzone);
|
dmz_reclaim_empty(zrc, dzone);
|
||||||
@ -395,54 +420,80 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
|
|||||||
}
|
}
|
||||||
out:
|
out:
|
||||||
if (ret) {
|
if (ret) {
|
||||||
|
if (ret == -EINTR)
|
||||||
|
DMDEBUG("(%s/%u): reclaim zone %u interrupted",
|
||||||
|
dmz_metadata_label(zmd), zrc->dev_idx,
|
||||||
|
rzone->id);
|
||||||
|
else
|
||||||
|
DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d",
|
||||||
|
dmz_metadata_label(zmd), zrc->dev_idx,
|
||||||
|
rzone->id, ret);
|
||||||
dmz_unlock_zone_reclaim(dzone);
|
dmz_unlock_zone_reclaim(dzone);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = dmz_flush_metadata(zrc->metadata);
|
ret = dmz_flush_metadata(zrc->metadata);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
dmz_dev_debug(zrc->dev,
|
DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d",
|
||||||
"Metadata flush for zone %u failed, err %d\n",
|
dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret);
|
||||||
dmz_id(zmd, rzone), ret);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
|
DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms",
|
||||||
dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
|
dmz_metadata_label(zmd), zrc->dev_idx,
|
||||||
|
rzone->id, jiffies_to_msecs(jiffies - start));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc)
|
||||||
* Test if the target device is idle.
|
|
||||||
*/
|
|
||||||
static inline int dmz_target_idle(struct dmz_reclaim *zrc)
|
|
||||||
{
|
{
|
||||||
return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
|
unsigned int nr_cache = dmz_nr_cache_zones(zmd);
|
||||||
|
unsigned int nr_unmap, nr_zones;
|
||||||
|
|
||||||
|
if (nr_cache) {
|
||||||
|
nr_zones = nr_cache;
|
||||||
|
nr_unmap = dmz_nr_unmap_cache_zones(zmd);
|
||||||
|
} else {
|
||||||
|
nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx);
|
||||||
|
nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx);
|
||||||
|
}
|
||||||
|
return nr_unmap * 100 / nr_zones;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Test if reclaim is necessary.
|
* Test if reclaim is necessary.
|
||||||
*/
|
*/
|
||||||
static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
|
static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap)
|
||||||
{
|
{
|
||||||
struct dmz_metadata *zmd = zrc->metadata;
|
unsigned int nr_reclaim;
|
||||||
unsigned int nr_rnd = dmz_nr_rnd_zones(zmd);
|
|
||||||
unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
|
nr_reclaim = dmz_nr_rnd_zones(zrc->metadata, zrc->dev_idx);
|
||||||
unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
|
|
||||||
|
if (dmz_nr_cache_zones(zrc->metadata)) {
|
||||||
|
/*
|
||||||
|
* The first device in a multi-device
|
||||||
|
* setup only contains cache zones, so
|
||||||
|
* never start reclaim there.
|
||||||
|
*/
|
||||||
|
if (zrc->dev_idx == 0)
|
||||||
|
return false;
|
||||||
|
nr_reclaim += dmz_nr_cache_zones(zrc->metadata);
|
||||||
|
}
|
||||||
|
|
||||||
/* Reclaim when idle */
|
/* Reclaim when idle */
|
||||||
if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd)
|
if (dmz_target_idle(zrc) && nr_reclaim)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
/* If there are still plenty of random zones, do not reclaim */
|
/* If there are still plenty of cache zones, do not reclaim */
|
||||||
if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND)
|
if (p_unmap >= DMZ_RECLAIM_HIGH_UNMAP_ZONES)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the percentage of unmapped random zones is low,
|
* If the percentage of unmapped cache zones is low,
|
||||||
* reclaim even if the target is busy.
|
* reclaim even if the target is busy.
|
||||||
*/
|
*/
|
||||||
return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
|
return p_unmap <= DMZ_RECLAIM_LOW_UNMAP_ZONES;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -452,14 +503,14 @@ static void dmz_reclaim_work(struct work_struct *work)
|
|||||||
{
|
{
|
||||||
struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
|
struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
|
||||||
struct dmz_metadata *zmd = zrc->metadata;
|
struct dmz_metadata *zmd = zrc->metadata;
|
||||||
unsigned int nr_rnd, nr_unmap_rnd;
|
unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0;
|
||||||
unsigned int p_unmap_rnd;
|
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (dmz_bdev_is_dying(zrc->dev))
|
if (dmz_dev_is_dying(zmd))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!dmz_should_reclaim(zrc)) {
|
p_unmap = dmz_reclaim_percentage(zrc);
|
||||||
|
if (!dmz_should_reclaim(zrc, p_unmap)) {
|
||||||
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
|
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -470,27 +521,29 @@ static void dmz_reclaim_work(struct work_struct *work)
|
|||||||
* and slower if there are still some free random zones to avoid
|
* and slower if there are still some free random zones to avoid
|
||||||
* as much as possible to negatively impact the user workload.
|
* as much as possible to negatively impact the user workload.
|
||||||
*/
|
*/
|
||||||
nr_rnd = dmz_nr_rnd_zones(zmd);
|
if (dmz_target_idle(zrc) || p_unmap < DMZ_RECLAIM_LOW_UNMAP_ZONES / 2) {
|
||||||
nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
|
|
||||||
p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
|
|
||||||
if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
|
|
||||||
/* Idle or very low percentage: go fast */
|
/* Idle or very low percentage: go fast */
|
||||||
zrc->kc_throttle.throttle = 100;
|
zrc->kc_throttle.throttle = 100;
|
||||||
} else {
|
} else {
|
||||||
/* Busy but we still have some random zone: throttle */
|
/* Busy but we still have some random zone: throttle */
|
||||||
zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
|
zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
dmz_dev_debug(zrc->dev,
|
nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx);
|
||||||
"Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
|
nr_rnd = dmz_nr_rnd_zones(zmd, zrc->dev_idx);
|
||||||
zrc->kc_throttle.throttle,
|
|
||||||
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
|
DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)",
|
||||||
p_unmap_rnd, nr_unmap_rnd, nr_rnd);
|
dmz_metadata_label(zmd), zrc->dev_idx,
|
||||||
|
zrc->kc_throttle.throttle,
|
||||||
|
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
|
||||||
|
p_unmap, dmz_nr_unmap_cache_zones(zmd),
|
||||||
|
dmz_nr_cache_zones(zmd),
|
||||||
|
dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx),
|
||||||
|
dmz_nr_rnd_zones(zmd, zrc->dev_idx));
|
||||||
|
|
||||||
ret = dmz_do_reclaim(zrc);
|
ret = dmz_do_reclaim(zrc);
|
||||||
if (ret) {
|
if (ret && ret != -EINTR) {
|
||||||
dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
|
if (!dmz_check_dev(zmd))
|
||||||
if (!dmz_check_bdev(zrc->dev))
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -500,8 +553,8 @@ static void dmz_reclaim_work(struct work_struct *work)
|
|||||||
/*
|
/*
|
||||||
* Initialize reclaim.
|
* Initialize reclaim.
|
||||||
*/
|
*/
|
||||||
int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
|
int dmz_ctr_reclaim(struct dmz_metadata *zmd,
|
||||||
struct dmz_reclaim **reclaim)
|
struct dmz_reclaim **reclaim, int idx)
|
||||||
{
|
{
|
||||||
struct dmz_reclaim *zrc;
|
struct dmz_reclaim *zrc;
|
||||||
int ret;
|
int ret;
|
||||||
@ -510,9 +563,9 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
|
|||||||
if (!zrc)
|
if (!zrc)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
zrc->dev = dev;
|
|
||||||
zrc->metadata = zmd;
|
zrc->metadata = zmd;
|
||||||
zrc->atime = jiffies;
|
zrc->atime = jiffies;
|
||||||
|
zrc->dev_idx = idx;
|
||||||
|
|
||||||
/* Reclaim kcopyd client */
|
/* Reclaim kcopyd client */
|
||||||
zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
|
zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
|
||||||
@ -524,8 +577,8 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
|
|||||||
|
|
||||||
/* Reclaim work */
|
/* Reclaim work */
|
||||||
INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
|
INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
|
||||||
zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
|
zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM,
|
||||||
dev->name);
|
dmz_metadata_label(zmd), idx);
|
||||||
if (!zrc->wq) {
|
if (!zrc->wq) {
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto err;
|
goto err;
|
||||||
@ -583,7 +636,8 @@ void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
|
|||||||
*/
|
*/
|
||||||
void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
|
void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
|
||||||
{
|
{
|
||||||
if (dmz_should_reclaim(zrc))
|
unsigned int p_unmap = dmz_reclaim_percentage(zrc);
|
||||||
|
|
||||||
|
if (dmz_should_reclaim(zrc, p_unmap))
|
||||||
mod_delayed_work(zrc->wq, &zrc->work, 0);
|
mod_delayed_work(zrc->wq, &zrc->work, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -17,7 +17,7 @@
  * Zone BIO context.
  */
 struct dmz_bioctx {
-    struct dmz_target   *target;
+    struct dmz_dev      *dev;
     struct dm_zone      *zone;
     struct bio          *bio;
     refcount_t          ref;
@@ -38,9 +38,10 @@ struct dm_chunk_work {
  * Target descriptor.
  */
 struct dmz_target {
-    struct dm_dev       *ddev;
+    struct dm_dev       **ddev;
+    unsigned int        nr_ddevs;

-    unsigned long       flags;
+    unsigned int        flags;

     /* Zoned block device information */
     struct dmz_dev      *dev;
@@ -48,9 +49,6 @@ struct dmz_target {
     /* For metadata handling */
     struct dmz_metadata *metadata;

-    /* For reclaim */
-    struct dmz_reclaim  *reclaim;
-
     /* For chunk work */
     struct radix_tree_root chunk_rxtree;
     struct workqueue_struct *chunk_wq;
@@ -76,12 +74,13 @@ struct dmz_target {
  */
 static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
 {
-    struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+    struct dmz_bioctx *bioctx =
+        dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

     if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
        bio->bi_status = status;
-    if (bio->bi_status != BLK_STS_OK)
-        bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
+    if (bioctx->dev && bio->bi_status != BLK_STS_OK)
+        bioctx->dev->flags |= DMZ_CHECK_BDEV;

     if (refcount_dec_and_test(&bioctx->ref)) {
        struct dm_zone *zone = bioctx->zone;
@@ -118,14 +117,20 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
              struct bio *bio, sector_t chunk_block,
              unsigned int nr_blocks)
 {
-    struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+    struct dmz_bioctx *bioctx =
+        dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+    struct dmz_dev *dev = zone->dev;
     struct bio *clone;

+    if (dev->flags & DMZ_BDEV_DYING)
+        return -EIO;
+
     clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
     if (!clone)
        return -ENOMEM;

-    bio_set_dev(clone, dmz->dev->bdev);
+    bio_set_dev(clone, dev->bdev);
+    bioctx->dev = dev;
     clone->bi_iter.bi_sector =
        dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
     clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
@@ -165,7 +170,8 @@ static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
 static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
               struct bio *bio)
 {
-    sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+    struct dmz_metadata *zmd = dmz->metadata;
+    sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
     unsigned int nr_blocks = dmz_bio_blocks(bio);
     sector_t end_block = chunk_block + nr_blocks;
     struct dm_zone *rzone, *bzone;
@@ -177,19 +183,22 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
        return 0;
     }

-    dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
-              (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
-              (dmz_is_rnd(zone) ? "RND" : "SEQ"),
-              dmz_id(dmz->metadata, zone),
-              (unsigned long long)chunk_block, nr_blocks);
+    DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
+        dmz_metadata_label(zmd),
+        (unsigned long long)dmz_bio_chunk(zmd, bio),
+        (dmz_is_rnd(zone) ? "RND" :
+         (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
+        zone->id,
+        (unsigned long long)chunk_block, nr_blocks);

     /* Check block validity to determine the read location */
     bzone = zone->bzone;
     while (chunk_block < end_block) {
        nr_blocks = 0;
-        if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
+        if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
+            chunk_block < zone->wp_block) {
            /* Test block validity in the data zone */
-            ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
+            ret = dmz_block_valid(zmd, zone, chunk_block);
            if (ret < 0)
                return ret;
            if (ret > 0) {
@@ -204,7 +213,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
         * Check the buffer zone, if there is one.
         */
        if (!nr_blocks && bzone) {
-            ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
+            ret = dmz_block_valid(zmd, bzone, chunk_block);
            if (ret < 0)
                return ret;
            if (ret > 0) {
@@ -216,8 +225,10 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,

        if (nr_blocks) {
            /* Valid blocks found: read them */
-            nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
-            ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks);
+            nr_blocks = min_t(unsigned int, nr_blocks,
+                      end_block - chunk_block);
+            ret = dmz_submit_bio(dmz, rzone, bio,
+                         chunk_block, nr_blocks);
            if (ret)
                return ret;
            chunk_block += nr_blocks;
@@ -308,25 +319,30 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
 static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
                struct bio *bio)
 {
-    sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+    struct dmz_metadata *zmd = dmz->metadata;
+    sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
     unsigned int nr_blocks = dmz_bio_blocks(bio);

     if (!zone)
        return -ENOSPC;

-    dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
-              (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
-              (dmz_is_rnd(zone) ? "RND" : "SEQ"),
-              dmz_id(dmz->metadata, zone),
-              (unsigned long long)chunk_block, nr_blocks);
+    DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
+        dmz_metadata_label(zmd),
+        (unsigned long long)dmz_bio_chunk(zmd, bio),
+        (dmz_is_rnd(zone) ? "RND" :
+         (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
+        zone->id,
+        (unsigned long long)chunk_block, nr_blocks);

-    if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
+    if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
+        chunk_block == zone->wp_block) {
        /*
         * zone is a random zone or it is a sequential zone
         * and the BIO is aligned to the zone write pointer:
         * direct write the zone.
         */
-        return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
+        return dmz_handle_direct_write(dmz, zone, bio,
+                           chunk_block, nr_blocks);
     }

     /*
@@ -345,7 +361,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
     struct dmz_metadata *zmd = dmz->metadata;
     sector_t block = dmz_bio_block(bio);
     unsigned int nr_blocks = dmz_bio_blocks(bio);
-    sector_t chunk_block = dmz_chunk_block(dmz->dev, block);
+    sector_t chunk_block = dmz_chunk_block(zmd, block);
     int ret = 0;

     /* For unmapped chunks, there is nothing to do */
@@ -355,16 +371,18 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
     if (dmz_is_readonly(zone))
        return -EROFS;

-    dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
-              (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
-              dmz_id(zmd, zone),
-              (unsigned long long)chunk_block, nr_blocks);
+    DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
+        dmz_metadata_label(dmz->metadata),
+        (unsigned long long)dmz_bio_chunk(zmd, bio),
+        zone->id,
+        (unsigned long long)chunk_block, nr_blocks);

     /*
      * Invalidate blocks in the data zone and its
      * buffer zone if one is mapped.
      */
-    if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
+    if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
+        chunk_block < zone->wp_block)
        ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
     if (ret == 0 && zone->bzone)
        ret = dmz_invalidate_blocks(zmd, zone->bzone,
@@ -378,31 +396,28 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
 static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
               struct bio *bio)
 {
-    struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+    struct dmz_bioctx *bioctx =
+        dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
     struct dmz_metadata *zmd = dmz->metadata;
     struct dm_zone *zone;
-    int ret;
+    int i, ret;

     /*
      * Write may trigger a zone allocation. So make sure the
      * allocation can succeed.
      */
     if (bio_op(bio) == REQ_OP_WRITE)
-        dmz_schedule_reclaim(dmz->reclaim);
+        for (i = 0; i < dmz->nr_ddevs; i++)
+            dmz_schedule_reclaim(dmz->dev[i].reclaim);

     dmz_lock_metadata(zmd);

-    if (dmz->dev->flags & DMZ_BDEV_DYING) {
-        ret = -EIO;
-        goto out;
-    }
-
     /*
      * Get the data zone mapping the chunk. There may be no
      * mapping for read and discard. If a mapping is obtained,
      * the zone returned will be set to active state.
      */
-    zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
+    zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
                     bio_op(bio));
     if (IS_ERR(zone)) {
        ret = PTR_ERR(zone);
@@ -413,6 +428,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
     if (zone) {
        dmz_activate_zone(zone);
        bioctx->zone = zone;
+        dmz_reclaim_bio_acc(zone->dev->reclaim);
     }

     switch (bio_op(bio)) {
@@ -427,8 +443,8 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
        ret = dmz_handle_discard(dmz, zone, bio);
        break;
     default:
-        dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
-                bio_op(bio));
+        DMERR("(%s): Unsupported BIO operation 0x%x",
+              dmz_metadata_label(dmz->metadata), bio_op(bio));
        ret = -EIO;
     }
@@ -502,7 +518,8 @@ static void dmz_flush_work(struct work_struct *work)
     /* Flush dirty metadata blocks */
     ret = dmz_flush_metadata(dmz->metadata);
     if (ret)
-        dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
+        DMDEBUG("(%s): Metadata flush failed, rc=%d",
+            dmz_metadata_label(dmz->metadata), ret);

     /* Process queued flush requests */
     while (1) {
@@ -525,7 +542,7 @@ static void dmz_flush_work(struct work_struct *work)
  */
 static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
 {
-    unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
+    unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
     struct dm_chunk_work *cw;
     int ret = 0;

@@ -558,7 +575,6 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)

     bio_list_add(&cw->bio_list, bio);

-    dmz_reclaim_bio_acc(dmz->reclaim);
     if (queue_work(dmz->chunk_wq, &cw->work))
        dmz_get_chunk_work(cw);
 out:
@@ -618,23 +634,22 @@ bool dmz_check_bdev(struct dmz_dev *dmz_dev)
 static int dmz_map(struct dm_target *ti, struct bio *bio)
 {
     struct dmz_target *dmz = ti->private;
-    struct dmz_dev *dev = dmz->dev;
+    struct dmz_metadata *zmd = dmz->metadata;
     struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
     sector_t sector = bio->bi_iter.bi_sector;
     unsigned int nr_sectors = bio_sectors(bio);
     sector_t chunk_sector;
     int ret;

-    if (dmz_bdev_is_dying(dmz->dev))
+    if (dmz_dev_is_dying(zmd))
        return DM_MAPIO_KILL;

-    dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
-              bio_op(bio), (unsigned long long)sector, nr_sectors,
-              (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
-              (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
-              (unsigned int)dmz_bio_blocks(bio));
-
-    bio_set_dev(bio, dev->bdev);
+    DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
+        dmz_metadata_label(zmd),
+        bio_op(bio), (unsigned long long)sector, nr_sectors,
+        (unsigned long long)dmz_bio_chunk(zmd, bio),
+        (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
+        (unsigned int)dmz_bio_blocks(bio));

     if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
        return DM_MAPIO_REMAPPED;
@@ -644,7 +659,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
        return DM_MAPIO_KILL;

     /* Initialize the BIO context */
-    bioctx->target = dmz;
+    bioctx->dev = NULL;
     bioctx->zone = NULL;
     bioctx->bio = bio;
     refcount_set(&bioctx->ref, 1);
@@ -659,17 +674,17 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
     }

     /* Split zone BIOs to fit entirely into a zone */
-    chunk_sector = sector & (dev->zone_nr_sectors - 1);
-    if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
-        dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
+    chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
+    if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
+        dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

     /* Now ready to handle this BIO */
     ret = dmz_queue_chunk_work(dmz, bio);
     if (ret) {
-        dmz_dev_debug(dmz->dev,
-                  "BIO op %d, can't process chunk %llu, err %i\n",
-                  bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
-                  ret);
+        DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
+            dmz_metadata_label(zmd),
+            bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
+            ret);
        return DM_MAPIO_REQUEUE;
     }
@@ -679,64 +694,65 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
 /*
  * Get zoned device information.
  */
-static int dmz_get_zoned_device(struct dm_target *ti, char *path)
+static int dmz_get_zoned_device(struct dm_target *ti, char *path,
+                int idx, int nr_devs)
 {
     struct dmz_target *dmz = ti->private;
-    struct request_queue *q;
+    struct dm_dev *ddev;
     struct dmz_dev *dev;
-    sector_t aligned_capacity;
     int ret;
+    struct block_device *bdev;

     /* Get the target device */
-    ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
+    ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
     if (ret) {
        ti->error = "Get target device failed";
-        dmz->ddev = NULL;
        return ret;
     }

-    dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
-    if (!dev) {
-        ret = -ENOMEM;
-        goto err;
+    bdev = ddev->bdev;
+    if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
+        if (nr_devs == 1) {
+            ti->error = "Invalid regular device";
+            goto err;
+        }
+        if (idx != 0) {
+            ti->error = "First device must be a regular device";
+            goto err;
+        }
+        if (dmz->ddev[0]) {
+            ti->error = "Too many regular devices";
+            goto err;
+        }
+        dev = &dmz->dev[idx];
+        dev->flags = DMZ_BDEV_REGULAR;
+    } else {
+        if (dmz->ddev[idx]) {
+            ti->error = "Too many zoned devices";
+            goto err;
+        }
+        if (nr_devs > 1 && idx == 0) {
+            ti->error = "First device must be a regular device";
+            goto err;
+        }
+        dev = &dmz->dev[idx];
     }
-
-    dev->bdev = dmz->ddev->bdev;
+    dev->bdev = bdev;
+    dev->dev_idx = idx;
     (void)bdevname(dev->bdev, dev->name);

-    if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
-        ti->error = "Not a zoned block device";
-        ret = -EINVAL;
+    dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+    if (ti->begin) {
+        ti->error = "Partial mapping is not supported";
        goto err;
     }

-    q = bdev_get_queue(dev->bdev);
-    dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
-    aligned_capacity = dev->capacity &
-                ~((sector_t)blk_queue_zone_sectors(q) - 1);
-    if (ti->begin ||
-        ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
-        ti->error = "Partial mapping not supported";
-        ret = -EINVAL;
-        goto err;
-    }
-
-    dev->zone_nr_sectors = blk_queue_zone_sectors(q);
-    dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
-
-    dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
-    dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
-
-    dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk);
-
-    dmz->dev = dev;
+    dmz->ddev[idx] = ddev;

     return 0;
 err:
-    dm_put_device(ti, dmz->ddev);
-    kfree(dev);
-
-    return ret;
+    dm_put_device(ti, ddev);
+    return -EINVAL;
 }

 /*
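For orientation only: with the constructor below accepting several device paths, a dm-zoned table line lists a regular (cache) device first, followed by one or more zoned devices. A purely hypothetical example, with placeholder sizes and device names:

    0 209715200 zoned /dev/nvme0n1 /dev/sdb /dev/sdc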
@@ -745,10 +761,78 @@ err:
 static void dmz_put_zoned_device(struct dm_target *ti)
 {
     struct dmz_target *dmz = ti->private;
+    int i;

-    dm_put_device(ti, dmz->ddev);
-    kfree(dmz->dev);
-    dmz->dev = NULL;
+    for (i = 0; i < dmz->nr_ddevs; i++) {
+        if (dmz->ddev[i]) {
+            dm_put_device(ti, dmz->ddev[i]);
+            dmz->ddev[i] = NULL;
+        }
+    }
+}
+
+static int dmz_fixup_devices(struct dm_target *ti)
+{
+    struct dmz_target *dmz = ti->private;
+    struct dmz_dev *reg_dev, *zoned_dev;
+    struct request_queue *q;
+    sector_t zone_nr_sectors = 0;
+    int i;
+
+    /*
+     * When we have more than on devices, the first one must be a
+     * regular block device and the others zoned block devices.
+     */
+    if (dmz->nr_ddevs > 1) {
+        reg_dev = &dmz->dev[0];
+        if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
+            ti->error = "Primary disk is not a regular device";
+            return -EINVAL;
+        }
+        for (i = 1; i < dmz->nr_ddevs; i++) {
+            zoned_dev = &dmz->dev[i];
+            if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
+                ti->error = "Secondary disk is not a zoned device";
+                return -EINVAL;
+            }
+            q = bdev_get_queue(zoned_dev->bdev);
+            if (zone_nr_sectors &&
+                zone_nr_sectors != blk_queue_zone_sectors(q)) {
+                ti->error = "Zone nr sectors mismatch";
+                return -EINVAL;
+            }
+            zone_nr_sectors = blk_queue_zone_sectors(q);
+            zoned_dev->zone_nr_sectors = zone_nr_sectors;
+            zoned_dev->nr_zones =
+                blkdev_nr_zones(zoned_dev->bdev->bd_disk);
+        }
+    } else {
+        reg_dev = NULL;
+        zoned_dev = &dmz->dev[0];
+        if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
+            ti->error = "Disk is not a zoned device";
+            return -EINVAL;
+        }
+        q = bdev_get_queue(zoned_dev->bdev);
+        zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
+        zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
+    }
+
+    if (reg_dev) {
+        sector_t zone_offset;
+
+        reg_dev->zone_nr_sectors = zone_nr_sectors;
+        reg_dev->nr_zones =
+            DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
+                          reg_dev->zone_nr_sectors);
+        reg_dev->zone_offset = 0;
+        zone_offset = reg_dev->nr_zones;
+        for (i = 1; i < dmz->nr_ddevs; i++) {
+            dmz->dev[i].zone_offset = zone_offset;
+            zone_offset += dmz->dev[i].nr_zones;
+        }
+    }
+    return 0;
 }

 /*
@@ -757,11 +841,10 @@ static void dmz_put_zoned_device(struct dm_target *ti)
 static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
     struct dmz_target *dmz;
-    struct dmz_dev *dev;
-    int ret;
+    int ret, i;

     /* Check arguments */
-    if (argc != 1) {
+    if (argc < 1) {
        ti->error = "Invalid argument count";
        return -EINVAL;
     }
@@ -772,25 +855,42 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        ti->error = "Unable to allocate the zoned target descriptor";
        return -ENOMEM;
     }
+    dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
+    if (!dmz->dev) {
+        ti->error = "Unable to allocate the zoned device descriptors";
+        kfree(dmz);
+        return -ENOMEM;
+    }
+    dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
+    if (!dmz->ddev) {
+        ti->error = "Unable to allocate the dm device descriptors";
+        ret = -ENOMEM;
+        goto err;
+    }
+    dmz->nr_ddevs = argc;
+
     ti->private = dmz;

     /* Get the target zoned block device */
-    ret = dmz_get_zoned_device(ti, argv[0]);
-    if (ret) {
-        dmz->ddev = NULL;
-        goto err;
+    for (i = 0; i < argc; i++) {
+        ret = dmz_get_zoned_device(ti, argv[i], i, argc);
+        if (ret)
+            goto err_dev;
     }
+    ret = dmz_fixup_devices(ti);
+    if (ret)
+        goto err_dev;

     /* Initialize metadata */
-    dev = dmz->dev;
-    ret = dmz_ctr_metadata(dev, &dmz->metadata);
+    ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
+                   dm_table_device_name(ti->table));
     if (ret) {
        ti->error = "Metadata initialization failed";
        goto err_dev;
     }

     /* Set target (no write same support) */
-    ti->max_io_len = dev->zone_nr_sectors << 9;
+    ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9;
     ti->num_flush_bios = 1;
     ti->num_discard_bios = 1;
     ti->num_write_zeroes_bios = 1;
@@ -799,7 +899,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
     ti->discards_supported = true;

     /* The exposed capacity is the number of chunks that can be mapped */
-    ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
+    ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
+        dmz_zone_nr_sectors_shift(dmz->metadata);

     /* Zone BIO */
     ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
@@ -811,8 +912,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
     /* Chunk BIO work */
     mutex_init(&dmz->chunk_lock);
     INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
-    dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
-                    0, dev->name);
+    dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
+                    WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
+                    dmz_metadata_label(dmz->metadata));
     if (!dmz->chunk_wq) {
        ti->error = "Create chunk workqueue failed";
        ret = -ENOMEM;
@@ -824,7 +926,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
     bio_list_init(&dmz->flush_list);
     INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
     dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
-                        dev->name);
+                        dmz_metadata_label(dmz->metadata));
     if (!dmz->flush_wq) {
        ti->error = "Create flush workqueue failed";
        ret = -ENOMEM;
@@ -833,15 +935,18 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
     mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

     /* Initialize reclaim */
-    ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
-    if (ret) {
-        ti->error = "Zone reclaim initialization failed";
-        goto err_fwq;
+    for (i = 0; i < dmz->nr_ddevs; i++) {
+        ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
+        if (ret) {
+            ti->error = "Zone reclaim initialization failed";
+            goto err_fwq;
+        }
     }

-    dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
-             (unsigned long long)ti->len,
-             (unsigned long long)dmz_sect2blk(ti->len));
+    DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
+           dmz_metadata_label(dmz->metadata),
+           (unsigned long long)ti->len,
+           (unsigned long long)dmz_sect2blk(ti->len));

     return 0;
 err_fwq:
@@ -856,6 +961,7 @@ err_meta:
 err_dev:
     dmz_put_zoned_device(ti);
 err:
+    kfree(dmz->dev);
     kfree(dmz);

     return ret;
@@ -867,11 +973,13 @@ err:
 static void dmz_dtr(struct dm_target *ti)
 {
     struct dmz_target *dmz = ti->private;
+    int i;

     flush_workqueue(dmz->chunk_wq);
     destroy_workqueue(dmz->chunk_wq);

-    dmz_dtr_reclaim(dmz->reclaim);
+    for (i = 0; i < dmz->nr_ddevs; i++)
+        dmz_dtr_reclaim(dmz->dev[i].reclaim);

     cancel_delayed_work_sync(&dmz->flush_work);
     destroy_workqueue(dmz->flush_wq);
@@ -886,6 +994,7 @@ static void dmz_dtr(struct dm_target *ti)

     mutex_destroy(&dmz->chunk_lock);

+    kfree(dmz->dev);
     kfree(dmz);
 }

@@ -895,7 +1004,7 @@ static void dmz_dtr(struct dm_target *ti)
 static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
     struct dmz_target *dmz = ti->private;
-    unsigned int chunk_sectors = dmz->dev->zone_nr_sectors;
+    unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);

     limits->logical_block_size = DMZ_BLOCK_SIZE;
     limits->physical_block_size = DMZ_BLOCK_SIZE;
@@ -923,11 +1032,12 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
 static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
 {
     struct dmz_target *dmz = ti->private;
+    struct dmz_dev *dev = &dmz->dev[0];

-    if (!dmz_check_bdev(dmz->dev))
+    if (!dmz_check_bdev(dev))
        return -EIO;

-    *bdev = dmz->dev->bdev;
+    *bdev = dev->bdev;

     return 0;
 }
@@ -938,9 +1048,11 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
 static void dmz_suspend(struct dm_target *ti)
 {
     struct dmz_target *dmz = ti->private;
+    int i;

     flush_workqueue(dmz->chunk_wq);
-    dmz_suspend_reclaim(dmz->reclaim);
+    for (i = 0; i < dmz->nr_ddevs; i++)
+        dmz_suspend_reclaim(dmz->dev[i].reclaim);
     cancel_delayed_work_sync(&dmz->flush_work);
 }

@@ -950,24 +1062,95 @@ static void dmz_suspend(struct dm_target *ti)
 static void dmz_resume(struct dm_target *ti)
 {
     struct dmz_target *dmz = ti->private;
+    int i;

     queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
-    dmz_resume_reclaim(dmz->reclaim);
+    for (i = 0; i < dmz->nr_ddevs; i++)
+        dmz_resume_reclaim(dmz->dev[i].reclaim);
 }

 static int dmz_iterate_devices(struct dm_target *ti,
                   iterate_devices_callout_fn fn, void *data)
 {
     struct dmz_target *dmz = ti->private;
-    struct dmz_dev *dev = dmz->dev;
-    sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);
+    unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
+    sector_t capacity;
+    int i, r;

-    return fn(ti, dmz->ddev, 0, capacity, data);
+    for (i = 0; i < dmz->nr_ddevs; i++) {
+        capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
+        r = fn(ti, dmz->ddev[i], 0, capacity, data);
+        if (r)
+            break;
+    }
+    return r;
+}
+
+static void dmz_status(struct dm_target *ti, status_type_t type,
+               unsigned int status_flags, char *result,
+               unsigned int maxlen)
+{
+    struct dmz_target *dmz = ti->private;
+    ssize_t sz = 0;
+    char buf[BDEVNAME_SIZE];
+    struct dmz_dev *dev;
+    int i;
+
+    switch (type) {
+    case STATUSTYPE_INFO:
+        DMEMIT("%u zones %u/%u cache",
+               dmz_nr_zones(dmz->metadata),
+               dmz_nr_unmap_cache_zones(dmz->metadata),
+               dmz_nr_cache_zones(dmz->metadata));
+        for (i = 0; i < dmz->nr_ddevs; i++) {
+            /*
+             * For a multi-device setup the first device
+             * contains only cache zones.
+             */
+            if ((i == 0) &&
+                (dmz_nr_cache_zones(dmz->metadata) > 0))
+                continue;
+            DMEMIT(" %u/%u random %u/%u sequential",
+                   dmz_nr_unmap_rnd_zones(dmz->metadata, i),
+                   dmz_nr_rnd_zones(dmz->metadata, i),
+                   dmz_nr_unmap_seq_zones(dmz->metadata, i),
+                   dmz_nr_seq_zones(dmz->metadata, i));
+        }
+        break;
+    case STATUSTYPE_TABLE:
+        dev = &dmz->dev[0];
+        format_dev_t(buf, dev->bdev->bd_dev);
+        DMEMIT("%s", buf);
+        for (i = 1; i < dmz->nr_ddevs; i++) {
+            dev = &dmz->dev[i];
+            format_dev_t(buf, dev->bdev->bd_dev);
+            DMEMIT(" %s", buf);
+        }
+        break;
+    }
+    return;
+}
+
+static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
+               char *result, unsigned int maxlen)
+{
+    struct dmz_target *dmz = ti->private;
+    int r = -EINVAL;
+
+    if (!strcasecmp(argv[0], "reclaim")) {
+        int i;
+
+        for (i = 0; i < dmz->nr_ddevs; i++)
+            dmz_schedule_reclaim(dmz->dev[i].reclaim);
+        r = 0;
+    } else
+        DMERR("unrecognized message %s", argv[0]);
+    return r;
 }

 static struct target_type dmz_type = {
     .name        = "zoned",
-    .version     = {1, 1, 0},
+    .version     = {2, 0, 0},
     .features    = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
     .module      = THIS_MODULE,
     .ctr         = dmz_ctr,
@@ -978,6 +1161,8 @@ static struct target_type dmz_type = {
     .postsuspend     = dmz_suspend,
     .resume      = dmz_resume,
     .iterate_devices = dmz_iterate_devices,
+    .status      = dmz_status,
+    .message     = dmz_message,
 };

 static int __init dmz_init(void)
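As a hedged illustration of the STATUSTYPE_INFO line built by dmz_status() above (all counts are made up), a two-device setup might report something like

    5124 zones 432/1024 cache 28/3072 random 0/1028 sequential

following the "%u zones %u/%u cache" prefix plus one " %u/%u random %u/%u sequential" group per data device, while the STATUSTYPE_TABLE case simply lists the device numbers of the configured block devices.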
@@ -45,34 +45,50 @@
 #define dmz_bio_block(bio)  dmz_sect2blk((bio)->bi_iter.bi_sector)
 #define dmz_bio_blocks(bio) dmz_sect2blk(bio_sectors(bio))

+struct dmz_metadata;
+struct dmz_reclaim;
+
 /*
  * Zoned block device information.
  */
 struct dmz_dev {
     struct block_device *bdev;
+    struct dmz_metadata *metadata;
+    struct dmz_reclaim  *reclaim;

     char            name[BDEVNAME_SIZE];
+    uuid_t          uuid;

     sector_t        capacity;

+    unsigned int        dev_idx;
+
     unsigned int        nr_zones;
+    unsigned int        zone_offset;

     unsigned int        flags;

     sector_t        zone_nr_sectors;
-    unsigned int        zone_nr_sectors_shift;

-    sector_t        zone_nr_blocks;
-    sector_t        zone_nr_blocks_shift;
+    unsigned int        nr_rnd;
+    atomic_t        unmap_nr_rnd;
+    struct list_head    unmap_rnd_list;
+    struct list_head    map_rnd_list;
+
+    unsigned int        nr_seq;
+    atomic_t        unmap_nr_seq;
+    struct list_head    unmap_seq_list;
+    struct list_head    map_seq_list;
 };

-#define dmz_bio_chunk(dev, bio) ((bio)->bi_iter.bi_sector >> \
-                 (dev)->zone_nr_sectors_shift)
-#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+#define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \
+                 dmz_zone_nr_sectors_shift(zmd))
+#define dmz_chunk_block(zmd, b) ((b) & (dmz_zone_nr_blocks(zmd) - 1))

 /* Device flags. */
 #define DMZ_BDEV_DYING      (1 << 0)
 #define DMZ_CHECK_BDEV      (2 << 0)
+#define DMZ_BDEV_REGULAR    (4 << 0)

 /*
  * Zone descriptor.
@@ -81,12 +97,18 @@ struct dm_zone {
     /* For listing the zone depending on its state */
     struct list_head    link;

+    /* Device containing this zone */
+    struct dmz_dev      *dev;
+
     /* Zone type and state */
     unsigned long       flags;

     /* Zone activation reference count */
     atomic_t        refcount;

+    /* Zone id */
+    unsigned int        id;
+
     /* Zone write pointer block (relative to the zone start block) */
     unsigned int        wp_block;

@@ -109,6 +131,7 @@ struct dm_zone {
  */
 enum {
     /* Zone write type */
+    DMZ_CACHE,
     DMZ_RND,
     DMZ_SEQ,

@@ -120,22 +143,28 @@ enum {
     DMZ_META,
     DMZ_DATA,
     DMZ_BUF,
+    DMZ_RESERVED,

     /* Zone internal state */
     DMZ_RECLAIM,
     DMZ_SEQ_WRITE_ERR,
+    DMZ_RECLAIM_TERMINATE,
 };

 /*
  * Zone data accessors.
  */
+#define dmz_is_cache(z)     test_bit(DMZ_CACHE, &(z)->flags)
 #define dmz_is_rnd(z)       test_bit(DMZ_RND, &(z)->flags)
 #define dmz_is_seq(z)       test_bit(DMZ_SEQ, &(z)->flags)
 #define dmz_is_empty(z)     ((z)->wp_block == 0)
 #define dmz_is_offline(z)   test_bit(DMZ_OFFLINE, &(z)->flags)
 #define dmz_is_readonly(z)  test_bit(DMZ_READ_ONLY, &(z)->flags)
 #define dmz_in_reclaim(z)   test_bit(DMZ_RECLAIM, &(z)->flags)
+#define dmz_is_reserved(z)  test_bit(DMZ_RESERVED, &(z)->flags)
 #define dmz_seq_write_err(z)    test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
+#define dmz_reclaim_should_terminate(z) \
+                test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags)

 #define dmz_is_meta(z)      test_bit(DMZ_META, &(z)->flags)
 #define dmz_is_buf(z)       test_bit(DMZ_BUF, &(z)->flags)
@@ -158,13 +187,11 @@ enum {
 #define dmz_dev_debug(dev, format, args...) \
     DMDEBUG("(%s): " format, (dev)->name, ## args)

-struct dmz_metadata;
-struct dmz_reclaim;
-
 /*
  * Functions defined in dm-zoned-metadata.c
  */
-int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
+int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
+             struct dmz_metadata **zmd, const char *devname);
 void dmz_dtr_metadata(struct dmz_metadata *zmd);
 int dmz_resume_metadata(struct dmz_metadata *zmd);

@@ -175,23 +202,38 @@ void dmz_unlock_metadata(struct dmz_metadata *zmd);
 void dmz_lock_flush(struct dmz_metadata *zmd);
 void dmz_unlock_flush(struct dmz_metadata *zmd);
 int dmz_flush_metadata(struct dmz_metadata *zmd);
+const char *dmz_metadata_label(struct dmz_metadata *zmd);

-unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone);
 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone);
 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone);
 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd);

-#define DMZ_ALLOC_RND       0x01
-#define DMZ_ALLOC_RECLAIM   0x02
+bool dmz_check_dev(struct dmz_metadata *zmd);
+bool dmz_dev_is_dying(struct dmz_metadata *zmd);

-struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags);
+#define DMZ_ALLOC_RND       0x01
+#define DMZ_ALLOC_CACHE     0x02
+#define DMZ_ALLOC_SEQ       0x04
+#define DMZ_ALLOC_RECLAIM   0x10
+
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd,
+                   unsigned int dev_idx, unsigned long flags);
 void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone);

 void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
          unsigned int chunk);
 void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
-unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
-unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd);
+unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd);
+unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd);
+unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd);

 /*
  * Activate a zone (increment its reference count).
@@ -201,26 +243,10 @@ static inline void dmz_activate_zone(struct dm_zone *zone)
     atomic_inc(&zone->refcount);
 }

-/*
- * Deactivate a zone. This decrement the zone reference counter
- * indicating that all BIOs to the zone have completed when the count is 0.
- */
-static inline void dmz_deactivate_zone(struct dm_zone *zone)
-{
-    atomic_dec(&zone->refcount);
-}
-
-/*
- * Test if a zone is active, that is, has a refcount > 0.
- */
-static inline bool dmz_is_active(struct dm_zone *zone)
-{
-    return atomic_read(&zone->refcount);
-}
-
 int dmz_lock_zone_reclaim(struct dm_zone *zone);
 void dmz_unlock_zone_reclaim(struct dm_zone *zone);
-struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd);
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
+                     unsigned int dev_idx, bool idle);

 struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd,
                      unsigned int chunk, int op);
@@ -244,8 +270,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
 /*
  * Functions defined in dm-zoned-reclaim.c
  */
-int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
-            struct dmz_reclaim **zrc);
+int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc, int idx);
 void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
 void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
 void dmz_resume_reclaim(struct dmz_reclaim *zrc);
@@ -258,4 +283,22 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
 bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
 bool dmz_check_bdev(struct dmz_dev *dmz_dev);

+/*
+ * Deactivate a zone. This decrement the zone reference counter
+ * indicating that all BIOs to the zone have completed when the count is 0.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+    dmz_reclaim_bio_acc(zone->dev->reclaim);
+    atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+    return atomic_read(&zone->refcount);
+}
+
 #endif /* DM_ZONED_H */
@@ -676,6 +676,15 @@ static bool md_in_flight(struct mapped_device *md)
     return md_in_flight_bios(md);
 }

+u64 dm_start_time_ns_from_clone(struct bio *bio)
+{
+    struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+    struct dm_io *io = tio->io;
+
+    return jiffies_to_nsecs(io->start_time);
+}
+EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+
 static void start_io_acct(struct dm_io *io)
 {
     struct mapped_device *md = io->md;
@@ -2610,7 +2619,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
     if (noflush)
        set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
     else
-        pr_debug("%s: suspending with flush\n", dm_device_name(md));
+        DMDEBUG("%s: suspending with flush", dm_device_name(md));

     /*
      * This gets reverted if there's an error later and the targets
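The dm_start_time_ns_from_clone() export added above hands targets and path selectors the submission time of a cloned bio. A minimal, hypothetical sketch of an end_io hook using it to accumulate service time; the example_stats type and field names are illustrative only, not part of this series:

#include <linux/atomic.h>
#include <linux/device-mapper.h>
#include <linux/ktime.h>

struct example_stats {
    atomic64_t total_lat_ns;    /* accumulated bio service time */
};

static int example_end_io(struct dm_target *ti, struct bio *bio,
              blk_status_t *error)
{
    struct example_stats *es = ti->private;
    u64 start_ns = dm_start_time_ns_from_clone(bio);

    /* current time minus submission time gives this bio's latency */
    atomic64_add(ktime_get_ns() - start_ns, &es->total_lat_ns);
    return DM_ENDIO_DONE;
}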
@@ -38,7 +38,7 @@ struct node_header {

 struct btree_node {
     struct node_header header;
-    __le64 keys[0];
+    __le64 keys[];
 } __packed;


@@ -68,7 +68,7 @@ struct ro_spine {
 };

 void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
-int exit_ro_spine(struct ro_spine *s);
+void exit_ro_spine(struct ro_spine *s);
 int ro_step(struct ro_spine *s, dm_block_t new_child);
 void ro_pop(struct ro_spine *s);
 struct btree_node *ro_node(struct ro_spine *s);
@@ -132,15 +132,13 @@ void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
     s->nodes[1] = NULL;
 }

-int exit_ro_spine(struct ro_spine *s)
+void exit_ro_spine(struct ro_spine *s)
 {
-    int r = 0, i;
+    int i;

     for (i = 0; i < s->count; i++) {
        unlock_block(s->info, s->nodes[i]);
     }
-
-    return r;
 }

 int ro_step(struct ro_spine *s, dm_block_t new_child)
@@ -332,6 +332,8 @@ void *dm_per_bio_data(struct bio *bio, size_t data_size);
 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size);
 unsigned dm_bio_get_target_bio_nr(const struct bio *bio);

+u64 dm_start_time_ns_from_clone(struct bio *bio);
+
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);

@@ -557,13 +559,8 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 #define DMINFO(fmt, ...) pr_info(DM_FMT(fmt), ##__VA_ARGS__)
 #define DMINFO_LIMIT(fmt, ...) pr_info_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)

-#ifdef CONFIG_DM_DEBUG
-#define DMDEBUG(fmt, ...) printk(KERN_DEBUG DM_FMT(fmt), ##__VA_ARGS__)
+#define DMDEBUG(fmt, ...) pr_debug(DM_FMT(fmt), ##__VA_ARGS__)
 #define DMDEBUG_LIMIT(fmt, ...) pr_debug_ratelimited(DM_FMT(fmt), ##__VA_ARGS__)
-#else
-#define DMDEBUG(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
-#define DMDEBUG_LIMIT(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
-#endif

 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
              0 : scnprintf(result + sz, maxlen - sz, x))
@@ -118,6 +118,11 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
  */
 int dm_bufio_issue_flush(struct dm_bufio_client *c);

+/*
+ * Send a discard request to the underlying device.
+ */
+int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count);
+
 /*
  * Like dm_bufio_release but also move the buffer to the new
  * block. dm_bufio_write_dirty_buffers is needed to commit the new block.
@@ -131,6 +136,13 @@ void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
  */
 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block);

+/*
+ * Free the given range of buffers.
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks);
+
 /*
  * Set the minimum number of buffers before cleanup happens.
  */
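A small, hypothetical sketch of how a dm-bufio client could combine the two interfaces declared above: pass a discard down to the underlying device and then drop any cached buffers for that range. The wrapper function is illustrative only and not part of this series:

#include <linux/dm-bufio.h>

static int example_discard_range(struct dm_bufio_client *c,
                 sector_t block, sector_t n_blocks)
{
    int r;

    /* Issue the discard to the underlying block device */
    r = dm_bufio_issue_discard(c, block, n_blocks);
    if (r)
        return r;

    /* Best-effort: forget clean, unused buffers caching that range */
    dm_bufio_forget_buffers(c, block, n_blocks);
    return 0;
}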