mirror of
https://github.com/systemd/systemd.git
synced 2024-11-28 04:33:36 +08:00
execute: add a new easy-to-use RestrictRealtime= option to units
It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and SCHED_DEADLINE is blocked, as these scheduling policies may be used to lock up the system.
This commit is contained in:
parent
abd84d4d83
commit
f4170c671b
@ -1413,6 +1413,19 @@
|
||||
</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>RestrictRealtime=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of
|
||||
the unit are refused. This restricts access to realtime task scheduling policies such as
|
||||
<constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See
|
||||
<citerefentry><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about
|
||||
these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods
|
||||
of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It
|
||||
is hence recommended to restrict access to realtime scheduling to the few programs that actually require
|
||||
them. Defaults to off.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
|
@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
||||
SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_VTABLE_END
|
||||
};
|
||||
|
||||
@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property(
|
||||
} else if (STR_IN_SET(name,
|
||||
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork",
|
||||
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) {
|
||||
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) {
|
||||
int b;
|
||||
|
||||
r = sd_bus_message_read(message, "b", &b);
|
||||
@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property(
|
||||
c->syslog_level_prefix = b;
|
||||
else if (streq(name, "MemoryDenyWriteExecute"))
|
||||
c->memory_deny_write_execute = b;
|
||||
else if (streq(name, "RestrictRealtime"))
|
||||
c->restrict_realtime = b;
|
||||
|
||||
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
|
||||
}
|
||||
|
@ -1264,6 +1264,76 @@ finish:
|
||||
return r;
|
||||
}
|
||||
|
||||
static int apply_restrict_realtime(const ExecContext *c) {
|
||||
static const int permitted_policies[] = {
|
||||
SCHED_OTHER,
|
||||
SCHED_BATCH,
|
||||
SCHED_IDLE,
|
||||
};
|
||||
|
||||
scmp_filter_ctx *seccomp;
|
||||
unsigned i;
|
||||
int r, p, max_policy = 0;
|
||||
|
||||
assert(c);
|
||||
|
||||
seccomp = seccomp_init(SCMP_ACT_ALLOW);
|
||||
if (!seccomp)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Determine the highest policy constant we want to allow */
|
||||
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
|
||||
if (permitted_policies[i] > max_policy)
|
||||
max_policy = permitted_policies[i];
|
||||
|
||||
/* Go through all policies with lower values than that, and block them -- unless they appear in the
|
||||
* whitelist. */
|
||||
for (p = 0; p < max_policy; p++) {
|
||||
bool good = false;
|
||||
|
||||
/* Check if this is in the whitelist. */
|
||||
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
|
||||
if (permitted_policies[i] == p) {
|
||||
good = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (good)
|
||||
continue;
|
||||
|
||||
/* Deny this policy */
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(sched_setscheduler),
|
||||
1,
|
||||
SCMP_A1(SCMP_CMP_EQ, p));
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
}
|
||||
|
||||
/* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
|
||||
* hence no need no check for < 0 values. */
|
||||
r = seccomp_rule_add(
|
||||
seccomp,
|
||||
SCMP_ACT_ERRNO(EPERM),
|
||||
SCMP_SYS(sched_setscheduler),
|
||||
1,
|
||||
SCMP_A1(SCMP_CMP_GT, max_policy));
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
|
||||
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
|
||||
r = seccomp_load(seccomp);
|
||||
|
||||
finish:
|
||||
seccomp_release(seccomp);
|
||||
return r;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void do_idle_pipe_dance(int idle_pipe[4]) {
|
||||
@ -1962,6 +2032,14 @@ static int exec_child(
|
||||
}
|
||||
}
|
||||
|
||||
/* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
|
||||
if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
|
||||
if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
|
||||
*exit_status = EXIT_LIMITS;
|
||||
return -errno;
|
||||
}
|
||||
}
|
||||
|
||||
if (!cap_test_all(context->capability_bounding_set)) {
|
||||
r = capability_bounding_set_drop(context->capability_bounding_set, false);
|
||||
if (r < 0) {
|
||||
@ -2017,7 +2095,7 @@ static int exec_child(
|
||||
}
|
||||
|
||||
if (context->no_new_privileges ||
|
||||
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || use_syscall_filter)))
|
||||
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
|
||||
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
|
||||
*exit_status = EXIT_NO_NEW_PRIVILEGES;
|
||||
return -errno;
|
||||
@ -2039,6 +2117,15 @@ static int exec_child(
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
if (context->restrict_realtime) {
|
||||
r = apply_restrict_realtime(context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SECCOMP;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
if (use_syscall_filter) {
|
||||
r = apply_seccomp(context);
|
||||
if (r < 0) {
|
||||
@ -2474,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
"%sProtectHome: %s\n"
|
||||
"%sProtectSystem: %s\n"
|
||||
"%sIgnoreSIGPIPE: %s\n"
|
||||
"%sMemoryDenyWriteExecute: %s\n",
|
||||
"%sMemoryDenyWriteExecute: %s\n"
|
||||
"%sRestrictRealtime: %s\n",
|
||||
prefix, c->umask,
|
||||
prefix, c->working_directory ? c->working_directory : "/",
|
||||
prefix, c->root_directory ? c->root_directory : "/",
|
||||
@ -2485,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
prefix, protect_home_to_string(c->protect_home),
|
||||
prefix, protect_system_to_string(c->protect_system),
|
||||
prefix, yes_no(c->ignore_sigpipe),
|
||||
prefix, yes_no(c->memory_deny_write_execute));
|
||||
prefix, yes_no(c->memory_deny_write_execute),
|
||||
prefix, yes_no(c->restrict_realtime));
|
||||
|
||||
STRV_FOREACH(e, c->environment)
|
||||
fprintf(f, "%sEnvironment: %s\n", prefix, *e);
|
||||
|
@ -193,12 +193,14 @@ struct ExecContext {
|
||||
char **runtime_directory;
|
||||
mode_t runtime_directory_mode;
|
||||
|
||||
bool memory_deny_write_execute;
|
||||
bool restrict_realtime;
|
||||
|
||||
bool oom_score_adjust_set:1;
|
||||
bool nice_set:1;
|
||||
bool ioprio_set:1;
|
||||
bool cpu_sched_set:1;
|
||||
bool no_new_privileges_set:1;
|
||||
bool memory_deny_write_execute;
|
||||
};
|
||||
|
||||
#include "cgroup-util.h"
|
||||
|
@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP',
|
||||
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
|
||||
$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context)
|
||||
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
|
||||
$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
|
||||
$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
|
||||
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
|
||||
$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
|
||||
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
|
||||
$1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit)
|
||||
|
Loading…
Reference in New Issue
Block a user