loop-util: rework how we lock loopback block devices

Let's rework how we lock loopback block devices in two ways:

1. Lock a separate fd, instead of the main block device fd. We already
   did that for our internal locking when allocating loopback block
   devices, but do so for the exposed locking (i.e.
   loop_device_flock()), too, so that the lock is independent of the
   main fd we actually use of IO.

2. Instead of locking the device during allocation of the loopback
   device, then unlocking it (which will make udev run), and then
   re-locking things if we need, let's instead just keep the lock the
   whole time, to make things a bit safer and faster, and not have to
   wait for udev at all. This is done by adding a "lock_op" parameter to
   loop device allocation functions that declares the initial state of
   the lock, and is one of LOCK_UN/LOCK_SH/LOCK_EX. This change also
   shortens a lot of code, since we allocate + immediately lock loopback
   devices pretty much everywhere.
This commit is contained in:
Lennart Poettering 2022-09-01 15:00:30 +02:00
parent 234c2e16e5
commit 7f52206a2b
12 changed files with 105 additions and 97 deletions

View File

@ -2058,16 +2058,11 @@ int setup_namespace(
root_image,
FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : -1 /* < 0 means writable if possible, read-only as fallback */,
FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
LOCK_SH,
&loop_device);
if (r < 0)
return log_debug_errno(r, "Failed to create loop device for root image: %m");
/* Make sure udevd won't issue BLKRRPART (which might flush out the loaded partition table)
* while we are still trying to mount things */
r = loop_device_flock(loop_device, LOCK_SH);
if (r < 0)
return log_debug_errno(r, "Failed to lock loopback device with LOCK_SH: %m");
r = dissect_image(
loop_device->fd,
&verity,

View File

@ -885,14 +885,10 @@ static int action_umount(const char *path) {
return log_error_errno(r, "Failed to get devname of block device " DEVNUM_FORMAT_STR ": %m",
DEVNUM_FORMAT_VAL(devno));
r = loop_device_open(devname, 0, &d);
r = loop_device_open(devname, 0, LOCK_EX, &d);
if (r < 0)
return log_error_errno(r, "Failed to open loop device '%s': %m", devname);
r = loop_device_flock(d, LOCK_EX);
if (r < 0)
return log_error_errno(r, "Failed to lock loop device '%s': %m", devname);
/* We've locked the loop device, now we're ready to unmount. To allow the unmount to succeed, we have
* to close the O_PATH fd we opened earlier. */
fd = safe_close(fd);
@ -944,16 +940,11 @@ static int run(int argc, char *argv[]) {
arg_image,
FLAGS_SET(arg_flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR,
FLAGS_SET(arg_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
LOCK_SH,
&d);
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", arg_image);
/* Make sure udevd doesn't issue BLKRRPART underneath us thus making devices disappear in the middle,
* that we assume already are there. */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return log_error_errno(r, "Failed to lock loopback device: %m");
r = dissect_image_and_warn(
d->fd,
arg_image,

View File

@ -1284,7 +1284,7 @@ int home_setup_luks(
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine backing device for DM %s.", setup->dm_name);
if (!setup->loop) {
r = loop_device_open(n, O_RDWR, &setup->loop);
r = loop_device_open(n, O_RDWR, LOCK_UN, &setup->loop);
if (r < 0)
return log_error_errno(r, "Failed to open loopback device %s: %m", n);
}
@ -1378,7 +1378,7 @@ int home_setup_luks(
return r;
}
r = loop_device_make(setup->image_fd, O_RDWR, offset, size, 0, &setup->loop);
r = loop_device_make(setup->image_fd, O_RDWR, offset, size, 0, LOCK_UN, &setup->loop);
if (r == -ENOENT) {
log_error_errno(r, "Loopback block device support is not available on this system.");
return -ENOLINK; /* make recognizable */
@ -2299,7 +2299,7 @@ int home_create_luks(
log_info("Writing of partition table completed.");
r = loop_device_make(setup->image_fd, O_RDWR, partition_offset, partition_size, 0, &setup->loop);
r = loop_device_make(setup->image_fd, O_RDWR, partition_offset, partition_size, 0, LOCK_EX, &setup->loop);
if (r < 0) {
if (r == -ENOENT) { /* this means /dev/loop-control doesn't exist, i.e. we are in a container
* or similar and loopback bock devices are not available, return a
@ -2311,10 +2311,6 @@ int home_create_luks(
return log_error_errno(r, "Failed to set up loopback device for %s: %m", setup->temporary_image_path);
}
r = loop_device_flock(setup->loop, LOCK_EX); /* make sure udev won't read before we are done */
if (r < 0)
return log_error_errno(r, "Failed to take lock on loop device: %m");
log_info("Setting up loopback device %s completed.", setup->loop->node ?: ip);
r = luks_format(setup->loop->node,

View File

@ -5746,19 +5746,13 @@ static int run(int argc, char *argv[]) {
arg_image,
arg_read_only ? O_RDONLY : O_RDWR,
FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
LOCK_SH,
&loop);
if (r < 0) {
log_error_errno(r, "Failed to set up loopback block device: %m");
goto finish;
}
/* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */
r = loop_device_flock(loop, LOCK_SH);
if (r < 0) {
log_error_errno(r, "Failed to take lock on loopback block device: %m");
goto finish;
}
r = dissect_image_and_warn(
loop->fd,
arg_image,

View File

@ -2789,14 +2789,10 @@ static int context_copy_blocks(Context *context) {
assert_se((whole_fd = fdisk_get_devfd(context->fdisk_context)) >= 0);
if (p->encrypt != ENCRYPT_OFF) {
r = loop_device_make(whole_fd, O_RDWR, p->offset, p->new_size, 0, &d);
r = loop_device_make(whole_fd, O_RDWR, p->offset, p->new_size, 0, LOCK_EX, &d);
if (r < 0)
return log_error_errno(r, "Failed to make loopback device of future partition %" PRIu64 ": %m", p->partno);
r = loop_device_flock(d, LOCK_EX);
if (r < 0)
return log_error_errno(r, "Failed to lock loopback device: %m");
r = partition_encrypt(context, p, d->node, &cd, &encrypted, &encrypted_dev_fd);
if (r < 0)
return log_error_errno(r, "Failed to encrypt device: %m");
@ -3038,14 +3034,10 @@ static int context_mkfs(Context *context) {
/* Loopback block devices are not only useful to turn regular files into block devices, but
* also to cut out sections of block devices into new block devices. */
r = loop_device_make(fd, O_RDWR, p->offset, p->new_size, 0, &d);
r = loop_device_make(fd, O_RDWR, p->offset, p->new_size, 0, LOCK_EX, &d);
if (r < 0)
return log_error_errno(r, "Failed to make loopback device of future partition %" PRIu64 ": %m", p->partno);
r = loop_device_flock(d, LOCK_EX);
if (r < 0)
return log_error_errno(r, "Failed to lock loopback device: %m");
if (p->encrypt != ENCRYPT_OFF) {
r = partition_encrypt(context, p, d->node, &cd, &encrypted, &encrypted_dev_fd);
if (r < 0)

View File

@ -333,7 +333,7 @@ static int portable_extract_by_path(
assert(path);
r = loop_device_make_by_path(path, O_RDONLY, LO_FLAGS_PARTSCAN, &d);
r = loop_device_make_by_path(path, O_RDONLY, LO_FLAGS_PARTSCAN, LOCK_SH, &d);
if (r == -EISDIR) {
_cleanup_free_ char *image_name = NULL;
@ -359,10 +359,6 @@ static int portable_extract_by_path(
/* We now have a loopback block device, let's fork off a child in its own mount namespace, mount it
* there, and extract the metadata we need. The metadata is sent from the child back to us. */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return log_debug_errno(r, "Failed to acquire lock on loopback block device: %m");
BLOCK_SIGNALS(SIGCHLD);
r = mkdtemp_malloc("/tmp/inspect-XXXXXX", &tmpdir);

View File

@ -1192,13 +1192,7 @@ int image_read_metadata(Image *i) {
_cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
r = loop_device_make_by_path(i->path, O_RDONLY, LO_FLAGS_PARTSCAN, &d);
if (r < 0)
return r;
/* Make sure udevd doesn't issue BLKRRPART in the background which might make our partitions
* disappear temporarily. */
r = loop_device_flock(d, LOCK_SH);
r = loop_device_make_by_path(i->path, O_RDONLY, LO_FLAGS_PARTSCAN, LOCK_SH, &d);
if (r < 0)
return r;

View File

@ -2966,15 +2966,11 @@ int mount_image_privately_interactively(
image,
FLAGS_SET(flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR,
FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
LOCK_SH,
&d);
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", image);
/* Make sure udevd doesn't issue BLKRRPART behind our backs */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return r;
r = dissect_image_and_warn(d->fd, image, &verity, NULL, d->diskseq, d->uevent_seqnum_not_before, d->timestamp_not_before, flags, &dissected_image);
if (r < 0)
return r;
@ -3081,14 +3077,11 @@ int verity_dissect_and_mount(
src_fd >= 0 ? FORMAT_PROC_FD_PATH(src_fd) : src,
-1,
verity.data_path ? 0 : LO_FLAGS_PARTSCAN,
LOCK_SH,
&loop_device);
if (r < 0)
return log_debug_errno(r, "Failed to create loop device for image: %m");
r = loop_device_flock(loop_device, LOCK_SH);
if (r < 0)
return log_debug_errno(r, "Failed to lock loop device: %m");
r = dissect_image(
loop_device->fd,
&verity,

View File

@ -117,13 +117,28 @@ static int device_has_block_children(sd_device *d) {
return !!sd_device_enumerator_get_device_first(e);
}
static int open_lock_fd(int primary_fd, int operation) {
int lock_fd;
assert(primary_fd >= 0);
lock_fd = fd_reopen(primary_fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
if (lock_fd < 0)
return lock_fd;
if (flock(lock_fd, operation) < 0)
return -errno;
return lock_fd;
}
static int loop_configure(
int fd,
int nr,
const struct loop_config *c,
bool *try_loop_configure,
uint64_t *ret_seqnum_not_before,
usec_t *ret_timestamp_not_before) {
usec_t *ret_timestamp_not_before,
int *ret_lock_fd) {
_cleanup_(sd_device_unrefp) sd_device *d = NULL;
_cleanup_free_ char *sysname = NULL;
@ -152,11 +167,9 @@ static int loop_configure(
* long time udev would possibly never run on it again, even though the fd is unlocked, simply
* because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to
* automatically release the lock, after we are done. */
lock_fd = fd_reopen(fd, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
lock_fd = open_lock_fd(fd, LOCK_EX);
if (lock_fd < 0)
return lock_fd;
if (flock(lock_fd, LOCK_EX) < 0)
return -errno;
/* Let's see if the device is really detached, i.e. currently has no associated partition block
* devices. On various kernels (such as 5.8) it is possible to have a loopback block device that
@ -247,12 +260,7 @@ static int loop_configure(
goto fail;
}
if (ret_seqnum_not_before)
*ret_seqnum_not_before = seqnum;
if (ret_timestamp_not_before)
*ret_timestamp_not_before = timestamp;
return 0;
goto success;
}
}
@ -317,10 +325,13 @@ static int loop_configure(
log_debug_errno(errno, "Failed to enable direct IO mode on loopback device /dev/loop%i, ignoring: %m", nr);
}
success:
if (ret_seqnum_not_before)
*ret_seqnum_not_before = seqnum;
if (ret_timestamp_not_before)
*ret_timestamp_not_before = timestamp;
if (ret_lock_fd)
*ret_lock_fd = TAKE_FD(lock_fd);
return 0;
@ -373,9 +384,10 @@ static int loop_device_make_internal(
uint64_t offset,
uint64_t size,
uint32_t loop_flags,
int lock_op,
LoopDevice **ret) {
_cleanup_close_ int direct_io_fd = -1;
_cleanup_close_ int direct_io_fd = -1, lock_fd = -1;
_cleanup_free_ char *loopdev = NULL;
bool try_loop_configure = true;
struct loop_config config;
@ -393,6 +405,12 @@ static int loop_device_make_internal(
return -errno;
if (S_ISBLK(st.st_mode)) {
if (lock_op != LOCK_UN) {
lock_fd = open_lock_fd(fd, lock_op);
if (lock_fd < 0)
return lock_fd;
}
if (ioctl(fd, LOOP_GET_STATUS64, &config.info) >= 0) {
/* Oh! This is a loopback device? That's interesting! */
@ -430,6 +448,7 @@ static int loop_device_make_internal(
return -ENOMEM;
*d = (LoopDevice) {
.fd = TAKE_FD(copy),
.lock_fd = TAKE_FD(lock_fd),
.nr = nr,
.node = TAKE_PTR(loopdev),
.relinquished = true, /* It's not allocated by us, don't destroy it when this object is freed */
@ -519,7 +538,7 @@ static int loop_device_make_internal(
if (!ERRNO_IS_DEVICE_ABSENT(errno))
return -errno;
} else {
r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, &timestamp);
r = loop_configure(loop, nr, &config, &try_loop_configure, &seqnum, &timestamp, &lock_fd);
if (r >= 0) {
loop_with_fd = TAKE_FD(loop);
break;
@ -583,11 +602,26 @@ static int loop_device_make_internal(
if (r < 0 && r != -EOPNOTSUPP)
return r;
switch (lock_op & ~LOCK_NB) {
case LOCK_EX: /* Already in effect */
break;
case LOCK_SH: /* Downgrade */
if (flock(lock_fd, lock_op) < 0)
return -errno;
break;
case LOCK_UN: /* Release */
lock_fd = safe_close(lock_fd);
break;
default:
assert_not_reached();
}
d = new(LoopDevice, 1);
if (!d)
return -ENOMEM;
*d = (LoopDevice) {
.fd = TAKE_FD(loop_with_fd),
.lock_fd = TAKE_FD(lock_fd),
.node = TAKE_PTR(loopdev),
.nr = nr,
.devno = st.st_rdev,
@ -622,6 +656,7 @@ int loop_device_make(
uint64_t offset,
uint64_t size,
uint32_t loop_flags,
int lock_op,
LoopDevice **ret) {
assert(fd >= 0);
@ -633,6 +668,7 @@ int loop_device_make(
offset,
size,
loop_flags_mangle(loop_flags),
lock_op,
ret);
}
@ -640,6 +676,7 @@ int loop_device_make_by_path(
const char *path,
int open_flags,
uint32_t loop_flags,
int lock_op,
LoopDevice **ret) {
int r, basic_flags, direct_flags, rdwr_flags;
@ -693,7 +730,7 @@ int loop_device_make_by_path(
direct ? "enabled" : "disabled",
direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : "");
return loop_device_make_internal(fd, open_flags, 0, 0, loop_flags, ret);
return loop_device_make_internal(fd, open_flags, 0, 0, loop_flags, lock_op, ret);
}
LoopDevice* loop_device_unref(LoopDevice *d) {
@ -702,6 +739,8 @@ LoopDevice* loop_device_unref(LoopDevice *d) {
if (!d)
return NULL;
d->lock_fd = safe_close(d->lock_fd);
if (d->fd >= 0) {
/* Implicitly sync the device, since otherwise in-flight blocks might not get written */
if (fsync(d->fd) < 0)
@ -768,8 +807,13 @@ void loop_device_unrelinquish(LoopDevice *d) {
d->relinquished = false;
}
int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) {
_cleanup_close_ int loop_fd = -1;
int loop_device_open(
const char *loop_path,
int open_flags,
int lock_op,
LoopDevice **ret) {
_cleanup_close_ int loop_fd = -1, lock_fd = -1;
_cleanup_free_ char *p = NULL;
struct loop_info64 info;
struct stat st;
@ -798,6 +842,12 @@ int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) {
} else
nr = -1;
if ((lock_op & ~LOCK_NB) != LOCK_UN) {
lock_fd = open_lock_fd(loop_fd, lock_op);
if (lock_fd < 0)
return lock_fd;
}
p = strdup(loop_path);
if (!p)
return -ENOMEM;
@ -808,6 +858,7 @@ int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret) {
*d = (LoopDevice) {
.fd = TAKE_FD(loop_fd),
.lock_fd = TAKE_FD(lock_fd),
.nr = nr,
.node = TAKE_PTR(p),
.relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */
@ -929,12 +980,28 @@ int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) {
}
int loop_device_flock(LoopDevice *d, int operation) {
assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX));
assert(d);
if (d->fd < 0)
return -EBADF;
/* When unlocking just close the lock fd */
if ((operation & ~LOCK_NB) == LOCK_UN) {
d->lock_fd = safe_close(d->lock_fd);
return 0;
}
return RET_NERRNO(flock(d->fd, operation));
/* If we had no lock fd so far, create one and lock it right-away */
if (d->lock_fd < 0) {
assert(d->fd >= 0);
d->lock_fd = open_lock_fd(d->fd, operation);
if (d->lock_fd < 0)
return d->lock_fd;
return 0;
}
/* Otherwise change the current lock mode on the existing fd */
return RET_NERRNO(flock(d->lock_fd, operation));
}
int loop_device_sync(LoopDevice *d) {

View File

@ -10,6 +10,7 @@ typedef struct LoopDevice LoopDevice;
struct LoopDevice {
int fd;
int lock_fd;
int nr;
dev_t devno;
char *node;
@ -19,9 +20,9 @@ struct LoopDevice {
usec_t timestamp_not_before; /* CLOCK_MONOTONIC timestamp taken immediately before attaching the loopback device, or USEC_INFINITY if we don't know */
};
int loop_device_make(int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t loop_flags, LoopDevice **ret);
int loop_device_make_by_path(const char *path, int open_flags, uint32_t loop_flags, LoopDevice **ret);
int loop_device_open(const char *loop_path, int open_flags, LoopDevice **ret);
int loop_device_make(int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t loop_flags, int lock_op, LoopDevice **ret);
int loop_device_make_by_path(const char *path, int open_flags, uint32_t loop_flags, int lock_op, LoopDevice **ret);
int loop_device_open(const char *loop_path, int open_flags, int lock_op, LoopDevice **ret);
LoopDevice* loop_device_unref(LoopDevice *d);
DEFINE_TRIVIAL_CLEANUP_FUNC(LoopDevice*, loop_device_unref);

View File

@ -534,14 +534,11 @@ static int merge_subprocess(Hashmap *images, const char *workspace) {
img->path,
O_RDONLY,
FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
LOCK_SH,
&d);
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", img->path);
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return log_error_errno(r, "Failed to lock loopback device: %m");
r = dissect_image_and_warn(
d->fd,
img->path,

View File

@ -49,16 +49,13 @@ static void* thread_func(void *ptr) {
assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);
r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop);
r = loop_device_make(fd, O_RDONLY, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &loop);
if (r < 0)
log_error_errno(r, "Failed to allocate loopback device: %m");
assert_se(r >= 0);
log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted);
/* Let's make sure udev doesn't call BLKRRPART in the background, while we try to mount the device. */
assert_se(loop_device_flock(loop, LOCK_SH) >= 0);
r = dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, DISSECT_IMAGE_READ_ONLY, &dissected);
if (r < 0)
log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node);
@ -215,7 +212,7 @@ static int run(int argc, char *argv[]) {
assert_se(pclose(sfdisk) == 0);
sfdisk = NULL;
assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, &loop) >= 0);
assert_se(loop_device_make(fd, O_RDWR, 0, UINT64_MAX, LO_FLAGS_PARTSCAN, LOCK_EX, &loop) >= 0);
#if HAVE_BLKID
_cleanup_(dissected_image_unrefp) DissectedImage *dissected = NULL;
@ -223,11 +220,6 @@ static int run(int argc, char *argv[]) {
pthread_t threads[arg_n_threads];
sd_id128_t id;
/* Take an explicit lock while we format the file systems, in accordance with
* https://systemd.io/BLOCK_DEVICE_LOCKING/. We don't want udev to interfere and probe while we write
* or even issue BLKRRPART or similar while we are working on this. */
assert_se(loop_device_flock(loop, LOCK_EX) >= 0);
assert_se(dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, 0, &dissected) >= 0);
assert_se(dissected->partitions[PARTITION_ESP].found);