nspawn: several follow-ups for recent changes (#35146)

Closes #35116.
This commit is contained in:
Yu Watanabe 2024-11-15 00:12:40 +09:00 committed by GitHub
commit 3ea89c64c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 187 additions and 85 deletions

View File

@ -2211,7 +2211,8 @@ static bool should_enable_fuse(void) {
else if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m");
else
log_warning_errno(r, "Disabling FUSE: Failed to determine FUSE version: %m");
log_full_errno(ERRNO_IS_NEG_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
"Disabling FUSE: Failed to determine FUSE version: %m");
return false;
}
@ -2226,97 +2227,144 @@ static bool should_enable_fuse(void) {
return true;
}
/* Makes the device node 'from' available at 'to' via a bind mount instead of mknod().
 *
 * 'to' is first created with touch() so that there is something to mount onto. If the bind mount
 * fails, 'to' is unlinked again.
 *
 * Returns 0 on success, or the negative error returned by touch() / mount_nofollow_verbose().
 *
 * All failures are logged at debug level only: the caller decides whether the failure is fatal (and
 * then logs it at error level itself) or may be ignored, so this helper must not emit an error message
 * for a failure that ends up being ignored. */
static int bind_mount_devnode(const char *from, const char *to) {
        int r;

        assert(from);
        assert(to);

        r = touch(to);
        if (r < 0)
                return log_debug_errno(r, "Failed to touch %s: %m", to);

        r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
        if (r < 0) {
                /* Best effort removal of the mount point; the result is deliberately not checked. */
                (void) unlink(to);
                return log_debug_errno(r, "Failed to bind mount %s to %s: %m", from, to);
        }

        return 0;
}
/* Copies a single host device node /dev/<node> to <dest>/dev/<node>.
 *
 * dest                 - root directory below which the node is created
 * node                 - path of the node relative to /dev/, e.g. "null" or "net/tun"
 * ignore_mknod_failure - if true, a failure to create the node (by mknod() and, where attempted, by
 *                        the bind mount fallback) is logged at debug level and 0 is returned
 *
 * Additionally tries to create the <dest>/dev/{char,block}/MAJOR:MINOR symlink pointing to the node;
 * a failure to create that symlink is only logged.
 *
 * Returns 0 on success, when the host node does not exist, or when a creation failure was ignored;
 * a negative errno-style value otherwise. */
static int copy_devnode_one(const char *dest, const char *node, bool ignore_mknod_failure) {
        int r;

        assert(dest);
        assert(!isempty(node));

        /* NOTE(review): presumably clears the umask for the rest of this scope, so that the modes
         * passed to mknod()/mkdir below are applied unmodified — confirm against the macro. */
        BLOCK_WITH_UMASK(0000);

        _cleanup_free_ char *from = path_join("/dev/", node);
        if (!from)
                return log_oom();

        _cleanup_free_ char *to = path_join(dest, from);
        if (!to)
                return log_oom();

        struct stat st;
        if (stat(from, &st) < 0) {
                if (errno != ENOENT)
                        return log_error_errno(errno, "Failed to stat %s: %m", from);

                log_debug_errno(errno, "Device node %s does not exist, ignoring.", from);
                return 0;
        }

        if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
                return log_error_errno(SYNTHETIC_ERRNO(ESTALE), "%s is not a device node.", from);

        /* Create the parent directory of the device node. Here, we assume that the path has at most one
         * subdirectory under /dev/, e.g. /dev/net/tun. */
        _cleanup_free_ char *parent = NULL;
        r = path_extract_directory(from, &parent);
        if (r < 0)
                return log_error_errno(r, "Failed to extract directory from %s: %m", from);

        if (!path_equal(parent, "/dev/")) {
                /* Store the result in r: otherwise the error logged and returned below would be the
                 * (non-negative) result of path_extract_directory() rather than the real failure. */
                r = userns_mkdir(dest, parent, 0755, 0, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create directory %s: %m", parent);
        }

        if (mknod(to, st.st_mode, st.st_rdev) < 0) {
                r = -errno; /* Save the original error code. */

                /* Explicitly warn the user when /dev/ is already populated. */
                if (r == -EEXIST)
                        log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);

                /* If arg_uid_shift != 0, then we cannot fall back to use bind mount. */
                if (arg_uid_shift != 0) {
                        if (ignore_mknod_failure) {
                                log_debug_errno(r, "Failed to mknod(%s), ignoring: %m", to);
                                return 0;
                        }

                        return log_error_errno(r, "Failed to mknod(%s): %m", to);
                }

                /* Some systems abusively restrict mknod but allow bind mounts. */
                if (bind_mount_devnode(from, to) < 0) {
                        /* Report the original mknod() error code, not the one from the fallback. */
                        if (ignore_mknod_failure) {
                                log_debug_errno(r, "Both mknod() and bind mount %s failed, ignoring: %m", to);
                                return 0;
                        }

                        return log_error_errno(r, "Both mknod() and bind mount %s failed: %m", to);
                }
        } else {
                /* mknod() succeeds, chown() it if necessary. */
                r = userns_lchown(to, 0, 0);
                if (r < 0)
                        return log_error_errno(r, "chown() of device node %s failed: %m", to);
        }

        /* Directory that holds the MAJOR:MINOR symlinks for this kind of device. */
        _cleanup_free_ char *dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
        if (!dn)
                return log_oom();

        r = userns_mkdir(dest, dn, 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create '%s': %m", dn);

        _cleanup_free_ char *sl = NULL;
        if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
                return log_oom();

        _cleanup_free_ char *prefixed = path_join(dest, sl);
        if (!prefixed)
                return log_oom();

        /* The symlink lives one level below /dev/, hence the target is relative to its parent. */
        _cleanup_free_ char *t = path_join("..", node);
        if (!t)
                return log_oom();

        if (symlink(t, prefixed) < 0)
                log_debug_errno(errno, "Failed to symlink '%s' to '%s', ignoring: %m", t, prefixed);

        return 0;
}
/* Populates <dest>/dev/ with a fixed set of host device nodes: "null", "zero", "full", "random",
 * "urandom", "tty", "net/tun", and "fuse" when enable_fuse is true.
 *
 * NOTE(review): this listing looks like a rendered diff whose +/- markers were stripped. Two bodies
 * appear interleaved below: one that iterates over the 'devnodes' strv and performs mknod()/bind
 * mount inline, and one that calls copy_devnode_one() per node. Braces do not balance as listed.
 * Confirm against the actual source file before relying on the statement order shown here. */
static int copy_devnodes(const char *dest, bool enable_fuse) {
_cleanup_strv_free_ char **devnodes = NULL;
int r = 0;
assert(dest);
devnodes = strv_new("null",
"zero",
"full",
"random",
"urandom",
"tty",
STRV_IFNOTNULL(enable_fuse ? "fuse" : NULL),
"net/tun");
if (!devnodes)
return log_oom();
BLOCK_WITH_UMASK(0000);
/* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): r is still 0 here (its initial value), so the error from userns_mkdir() is not the
 * value passed to log_error_errno() — confirm whether this is intended. */
if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
return log_error_errno(r, "Failed to create /dev/net directory: %m");
STRV_FOREACH(d, devnodes) {
_cleanup_free_ char *from = NULL, *to = NULL;
struct stat st;
from = path_join("/dev/", *d);
if (!from)
return log_oom();
to = path_join(dest, from);
if (!to)
return log_oom();
/* A host node that does not exist (ENOENT) is skipped; any other stat() failure is fatal. */
if (stat(from, &st) < 0) {
if (errno != ENOENT)
return log_error_errno(errno, "Failed to stat %s: %m", from);
} else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
return log_error_errno(SYNTHETIC_ERRNO(EIO),
"%s is not a char or block device, cannot copy.", from);
else {
_cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
if (mknod(to, st.st_mode, st.st_rdev) < 0) {
/* Explicitly warn the user when /dev is already populated. */
if (errno == EEXIST)
log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
/* The bind mount fallback below is only attempted for privilege errors and when
 * arg_uid_shift is 0; everything else is fatal. */
if (!ERRNO_IS_PRIVILEGE(errno) || arg_uid_shift != 0)
return log_error_errno(errno, "mknod(%s) failed: %m", to);
/* Some systems abusively restrict mknod but allow bind mounts. */
r = touch(to);
if (r < 0)
return log_error_errno(r, "touch (%s) failed: %m", to);
r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
if (r < 0)
return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
} else {
r = userns_lchown(to, 0, 0);
if (r < 0)
return log_error_errno(r, "chown() of device node %s failed: %m", to);
}
/* Create the <dest>/dev/{char,block}/MAJOR:MINOR symlink for the node; a symlink() failure is
 * only logged. */
dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
if (!dn)
return log_oom();
r = userns_mkdir(dest, dn, 0755, 0, 0);
if (r < 0)
return log_error_errno(r, "Failed to create '%s': %m", dn);
if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
return log_oom();
prefixed = path_join(dest, sl);
if (!prefixed)
return log_oom();
t = path_join("..", *d);
if (!t)
return log_oom();
if (symlink(t, prefixed) < 0)
log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
}
/* The basic nodes are mandatory: any failure from copy_devnode_one() is propagated. */
FOREACH_STRING(node, "null", "zero", "full", "random", "urandom", "tty") {
r = copy_devnode_one(dest, node, /* ignore_mknod_failure = */ false);
if (r < 0)
return r;
}
/* NOTE(review): as listed, this return precedes the copy_devnode_one() calls for "fuse" and
 * "net/tun" below; it presumably belongs to the other interleaved body — confirm. */
return r;
if (enable_fuse) {
r = copy_devnode_one(dest, "fuse", /* ignore_mknod_failure = */ false);
if (r < 0)
return r;
}
/* We unconditionally try to create /dev/net/tun, but let's ignore failure if --private-network is
 * unspecified. The failure can be triggered when e.g. DevicePolicy= is set, but DeviceAllow= does
 * not contain the device node, and --private-users=pick is specified. */
r = copy_devnode_one(dest, "net/tun", /* ignore_mknod_failure = */ !arg_private_network);
if (r < 0)
return r;
return 0;
}
static int make_extra_nodes(const char *dest) {

View File

@ -19,6 +19,8 @@ test_append_files() {
instmods mac80211_hwsim
# for IPMasquerade=
instmods "=net/netfilter"
# For /dev/net/tun
instmods tun
generate_module_dependencies
# For unprivileged mountfsd.
if command -v openssl >/dev/null 2>&1; then

View File

@ -1214,4 +1214,56 @@ testcase_unpriv_fuse() {
bash -c 'cat <>/dev/fuse' 2>&1)" == *'cat: -: Operation not permitted' ]]
}
test_tun() {
    # Usage: test_tun EXPECTED_RC EXISTS [NSPAWN_ARG...]
    #   EXPECTED_RC  exit status expected from the run under DevicePolicy=closed
    #   EXISTS       0 if /dev/net/tun must be absent in that run, anything else if it must be
    #                present and owned by 0:0
    #   NSPAWN_ARG   extra arguments passed to systemd-nspawn in both runs
    local expect=${1?} exists=${2?}
    local probe_present probe_absent probe id_format
    shift 2

    probe_present='[[ -c /dev/net/tun ]]; [[ "$(stat /dev/net/tun --format=%u)" == 0 ]]; [[ "$(stat /dev/net/tun --format=%g)" == 0 ]]'
    probe_absent='[[ ! -e /dev/net/tun ]]'

    # Default to the "present" probe, switch to "absent" only when EXISTS is 0.
    probe="$probe_present"
    if [[ "$exists" == 0 ]]; then
        probe="$probe_absent"
    fi

    # Plain run: the device node is always expected to be there.
    systemd-nspawn "$@" bash -xec "$probe_present"

    # check if the owner of the host device is unchanged, see issue #34243.
    for id_format in %u %g; do
        [[ "$(stat /dev/net/tun --format="$id_format")" == 0 ]]
    done

    # Without DeviceAllow= for /dev/net/tun, see issue #35116.
    assert_rc \
        "$expect" \
        systemd-run --pty --wait -p DevicePolicy=closed -p DeviceAllow="char-pts rw" \
        systemd-nspawn "$@" bash -xec "$probe"

    # The host device must still be owned by 0:0 after the restricted run as well.
    for id_format in %u %g; do
        [[ "$(stat /dev/net/tun --format="$id_format")" == 0 ]]
    done
}
testcase_dev_net_tun() {
    # Runs test_tun for every --private-users= mode, with and without --private-network.
    # Skipped when the host has no /dev/net/tun character device.
    local root
    local -a common

    [[ -c /dev/net/tun ]] || {
        echo "/dev/net/tun does not exist, skipping tests"
        return 0
    }

    root="$(mktemp -d /var/lib/machines/TEST-13-NSPAWN.tun.XXX)"
    create_dummy_container "$root"

    # Arguments shared by every invocation below.
    common=(--ephemeral --directory="$root")

    # Arguments: expected exit status of the restricted run, then whether the node should exist in it.
    test_tun 0 1 "${common[@]}" --private-users=no
    test_tun 0 1 "${common[@]}" --private-users=yes
    test_tun 0 0 "${common[@]}" --private-users=pick

    test_tun 0 1 "${common[@]}" --private-users=no --private-network
    test_tun 0 1 "${common[@]}" --private-users=yes --private-network
    test_tun 1 0 "${common[@]}" --private-users=pick --private-network

    rm -fr "$root"
}
run_testcases