mirror of
https://github.com/systemd/systemd.git
synced 2025-01-21 16:03:42 +08:00
nspawn: add --suppress-sync=yes mode for turning sync() and friends into NOPs via seccomp
This is supposed to be used by package/image builders such as mkosi to speed up building, since it allows us to suppress sync() inside a container. This does what Debian's eatmydata tool does, but for a container, and via seccomp (instead of LD_PRELOAD).
This commit is contained in:
parent
231c7645ca
commit
4a4654e024
@ -138,6 +138,12 @@ All tools:
|
||||
* `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
|
||||
container with a tmpfs, but leave the directory from the image in place.
|
||||
|
||||
* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls are
|
||||
turned into no-ops for the container payload (e.g. `sync()`, `fsync()`, `syncfs()`, …)
|
||||
and the `O_SYNC`/`O_DSYNC` flags are made unavailable to `open()` and
|
||||
friends. This is equivalent to passing `--suppress-sync=yes` on the
|
||||
`systemd-nspawn` command line.
|
||||
|
||||
`systemd-logind`:
|
||||
|
||||
* `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that
|
||||
|
@ -570,6 +570,24 @@
|
||||
before sending its own to systemd. For more details about notifications
|
||||
see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--suppress-sync=</option></term>
|
||||
|
||||
<listitem><para>Expects a boolean argument. If true, turns off any form of on-disk file system
|
||||
synchronization for the container payload. This means all system calls such as <citerefentry
|
||||
project='man-pages'><refentrytitle>sync</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
|
||||
<function>fsync()</function>, <function>syncfs()</function>, … will execute no operation, and the
|
||||
<constant>O_SYNC</constant>/<constant>O_DSYNC</constant> flags to <citerefentry
|
||||
project='man-pages'><refentrytitle>open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
|
||||
related calls will be made unavailable. This is potentially dangerous, as assumed data integrity
|
||||
guarantees to the container payload are not actually enforced (i.e. data assumed to have been written
|
||||
to disk might be lost if the system is shut down abnormally). However, this can dramatically improve
|
||||
container runtime performance – as long as these guarantees are not required or desirable, for
|
||||
example because any data written by the container is of temporary, redundant nature, or just an
|
||||
intermediary artifact that will be further processed and finalized by a later step in a
|
||||
pipeline. Defaults to false.</para></listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
</refsect2><refsect2>
|
||||
|
@ -365,6 +365,16 @@
|
||||
details.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>SuppressSync=</varname></term>
|
||||
|
||||
<listitem><para>Configures whether to suppress disk synchronization for the container payload. This
|
||||
is equivalent to the <option>--suppress-sync=</option> command line switch, and takes the same
|
||||
parameter. See
|
||||
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
|
||||
for details.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
|
@ -63,7 +63,7 @@ _systemd_nspawn() {
|
||||
|
||||
local -A OPTS=(
|
||||
[STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
|
||||
--keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U'
|
||||
--keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes'
|
||||
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
|
||||
-M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
|
||||
--register --network-interface --network-bridge --personality -i --image --tmpfs
|
||||
|
@ -59,6 +59,7 @@ Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0
|
||||
Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf)
|
||||
Exec.LinkJournal, config_parse_link_journal, 0, 0
|
||||
Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone)
|
||||
Exec.SuppressSync, config_parse_bool, 0, offsetof(Settings, suppress_sync)
|
||||
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
|
||||
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
|
||||
Files.Bind, config_parse_bind, 0, 0
|
||||
|
@ -127,9 +127,10 @@ typedef enum SettingsMask {
|
||||
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
|
||||
SETTING_CREDENTIALS = UINT64_C(1) << 30,
|
||||
SETTING_BIND_USER = UINT64_C(1) << 31,
|
||||
SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */
|
||||
SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
|
||||
_SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
|
||||
SETTING_SUPPRESS_SYNC = UINT64_C(1) << 32,
|
||||
SETTING_RLIMIT_FIRST = UINT64_C(1) << 33, /* we define one bit per resource limit here */
|
||||
SETTING_RLIMIT_LAST = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
|
||||
_SETTINGS_MASK_ALL = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
|
||||
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
|
||||
} SettingsMask;
|
||||
|
||||
@ -189,6 +190,7 @@ typedef struct Settings {
|
||||
LinkJournal link_journal;
|
||||
bool link_journal_try;
|
||||
TimezoneMode timezone;
|
||||
bool suppress_sync;
|
||||
|
||||
/* [Files] */
|
||||
int read_only;
|
||||
|
@ -229,6 +229,7 @@ static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
|
||||
static Credential *arg_credentials = NULL;
|
||||
static size_t arg_n_credentials = 0;
|
||||
static char **arg_bind_user = NULL;
|
||||
static bool arg_suppress_sync = false;
|
||||
|
||||
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
|
||||
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
|
||||
@ -342,7 +343,9 @@ static int help(void) {
|
||||
" -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
|
||||
" -u --user=USER Run the command under specified user or UID\n"
|
||||
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
|
||||
" --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
|
||||
" --notify-ready=BOOLEAN Receive notifications from the child init process\n"
|
||||
" --suppress-sync=BOOLEAN\n"
|
||||
" Suppress any form of disk data synchronization\n\n"
|
||||
"%3$sSystem Identity:%4$s\n"
|
||||
" -M --machine=NAME Set the machine name for the container\n"
|
||||
" --hostname=NAME Override the hostname for the container\n"
|
||||
@ -654,6 +657,12 @@ static int parse_environment(void) {
|
||||
if (e)
|
||||
arg_container_service_name = e;
|
||||
|
||||
r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
|
||||
if (r >= 0)
|
||||
arg_suppress_sync = r;
|
||||
else if (r != -ENXIO)
|
||||
log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
|
||||
|
||||
return detect_unified_cgroup_hierarchy_from_environment();
|
||||
}
|
||||
|
||||
@ -713,6 +722,7 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
ARG_SET_CREDENTIAL,
|
||||
ARG_LOAD_CREDENTIAL,
|
||||
ARG_BIND_USER,
|
||||
ARG_SUPPRESS_SYNC,
|
||||
};
|
||||
|
||||
static const struct option options[] = {
|
||||
@ -785,6 +795,7 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
|
||||
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
|
||||
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
|
||||
{ "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
|
||||
{}
|
||||
};
|
||||
|
||||
@ -1668,6 +1679,14 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
arg_settings_mask |= SETTING_BIND_USER;
|
||||
break;
|
||||
|
||||
case ARG_SUPPRESS_SYNC:
|
||||
r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
arg_settings_mask |= SETTING_SUPPRESS_SYNC;
|
||||
break;
|
||||
|
||||
case '?':
|
||||
return -EINVAL;
|
||||
|
||||
@ -3385,6 +3404,12 @@ static int inner_child(
|
||||
return r;
|
||||
}
|
||||
|
||||
if (arg_suppress_sync) {
|
||||
r = seccomp_suppress_sync();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
|
||||
}
|
||||
|
||||
#if HAVE_SELINUX
|
||||
if (arg_selinux_context)
|
||||
if (setexeccon(arg_selinux_context) < 0)
|
||||
@ -4552,6 +4577,9 @@ static int merge_settings(Settings *settings, const char *path) {
|
||||
arg_console_mode = settings->console_mode;
|
||||
}
|
||||
|
||||
if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0)
|
||||
arg_suppress_sync = settings->suppress_sync;
|
||||
|
||||
/* The following properties can only be set through the OCI settings logic, not from the command line, hence we
|
||||
* don't consult arg_settings_mask for them. */
|
||||
|
||||
|
@ -2205,3 +2205,98 @@ int parse_syscall_and_errno(const char *in, char **name, int *error) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
        bool installed = false;
        int r;

        /* Makes open() and openat() fail with EINVAL whenever all bits of 'flag' (O_SYNC or similar) are
         * set in their flags argument, in the hope that the caller then retries without the flag.
         *
         * Returns 0 if at least one rule could be added to the filter context, otherwise the error of
         * the last rule that failed to be added. */

#if SCMP_SYS(open) > 0
        /* The rule matches on syscall argument 1 of open() */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EINVAL),
                        SCMP_SYS(open),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for open: %m");
#endif

        /* The rule matches on syscall argument 2 of openat() */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EINVAL),
                        SCMP_SYS(openat),
                        1,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for openat: %m");

#ifdef __SNR_openat2
        /* The new openat2() system call can't be filtered sensibly, see above. Hence refuse it
         * unconditionally with ENOSYS, without looking at any argument. */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(ENOSYS),
                        SCMP_SYS(openat2),
                        0);
        if (r >= 0)
                installed = true;
        else
                log_debug_errno(r, "Failed to add filter for openat2: %m");
#endif

        if (installed)
                return 0;

        return r;
}
|
||||
|
||||
int seccomp_suppress_sync(void) {
|
||||
uint32_t arch;
|
||||
int r;
|
||||
|
||||
/* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
|
||||
* manageable, and also masks O_SYNC/O_DSYNC */
|
||||
|
||||
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
||||
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
||||
const char *c;
|
||||
|
||||
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
|
||||
int id;
|
||||
|
||||
id = seccomp_syscall_resolve_name(c);
|
||||
if (id == __NR_SCMP_ERROR) {
|
||||
log_debug("System call %s is not known, ignoring.", c);
|
||||
continue;
|
||||
}
|
||||
|
||||
r = seccomp_rule_add_exact(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
|
||||
id,
|
||||
0);
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
|
||||
}
|
||||
|
||||
(void) block_open_flag(seccomp, O_SYNC);
|
||||
#if O_DSYNC != O_SYNC
|
||||
(void) block_open_flag(seccomp, O_DSYNC);
|
||||
#endif
|
||||
|
||||
r = seccomp_load(seccomp);
|
||||
if (ERRNO_IS_SECCOMP_FATAL(r))
|
||||
return r;
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -150,3 +150,5 @@ static inline const char *seccomp_errno_or_action_to_string(int num) {
|
||||
}
|
||||
|
||||
int parse_syscall_and_errno(const char *in, char **name, int *error);
|
||||
|
||||
int seccomp_suppress_sync(void);
|
||||
|
Loading…
Reference in New Issue
Block a user