execute: add a new easy-to-use RestrictRealtime= option to units

It takes a boolean value. If true, access to SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE is blocked, which my be used to lock up the system.
This commit is contained in:
Lennart Poettering 2016-06-23 01:45:45 +02:00
parent abd84d4d83
commit f4170c671b
5 changed files with 114 additions and 5 deletions

View File

@ -1413,6 +1413,19 @@
</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RestrictRealtime=</varname></term>
<listitem><para>Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of
the unit are refused. This restricts access to realtime task scheduling policies such as
<constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See
<citerefentry><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about
these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods
of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It
is hence recommended to restrict access to realtime scheduling to the few programs that actually require
them. Defaults to off.</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_VTABLE_END
};
@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property(
} else if (STR_IN_SET(name,
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
"PrivateTmp", "PrivateDevices", "PrivateNetwork",
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) {
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) {
int b;
r = sd_bus_message_read(message, "b", &b);
@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property(
c->syslog_level_prefix = b;
else if (streq(name, "MemoryDenyWriteExecute"))
c->memory_deny_write_execute = b;
else if (streq(name, "RestrictRealtime"))
c->restrict_realtime = b;
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
}

View File

@ -1264,6 +1264,76 @@ finish:
return r;
}
static int apply_restrict_realtime(const ExecContext *c) {
static const int permitted_policies[] = {
SCHED_OTHER,
SCHED_BATCH,
SCHED_IDLE,
};
scmp_filter_ctx *seccomp;
unsigned i;
int r, p, max_policy = 0;
assert(c);
seccomp = seccomp_init(SCMP_ACT_ALLOW);
if (!seccomp)
return -ENOMEM;
/* Determine the highest policy constant we want to allow */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] > max_policy)
max_policy = permitted_policies[i];
/* Go through all policies with lower values than that, and block them -- unless they appear in the
* whitelist. */
for (p = 0; p < max_policy; p++) {
bool good = false;
/* Check if this is in the whitelist. */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] == p) {
good = true;
break;
}
if (good)
continue;
/* Deny this policy */
r = seccomp_rule_add(
seccomp,
SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(sched_setscheduler),
1,
SCMP_A1(SCMP_CMP_EQ, p));
if (r < 0)
goto finish;
}
/* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
* hence no need no check for < 0 values. */
r = seccomp_rule_add(
seccomp,
SCMP_ACT_ERRNO(EPERM),
SCMP_SYS(sched_setscheduler),
1,
SCMP_A1(SCMP_CMP_GT, max_policy));
if (r < 0)
goto finish;
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
goto finish;
r = seccomp_load(seccomp);
finish:
seccomp_release(seccomp);
return r;
}
#endif
static void do_idle_pipe_dance(int idle_pipe[4]) {
@ -1962,6 +2032,14 @@ static int exec_child(
}
}
/* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
*exit_status = EXIT_LIMITS;
return -errno;
}
}
if (!cap_test_all(context->capability_bounding_set)) {
r = capability_bounding_set_drop(context->capability_bounding_set, false);
if (r < 0) {
@ -2017,7 +2095,7 @@ static int exec_child(
}
if (context->no_new_privileges ||
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || use_syscall_filter)))
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
*exit_status = EXIT_NO_NEW_PRIVILEGES;
return -errno;
@ -2039,6 +2117,15 @@ static int exec_child(
return r;
}
}
if (context->restrict_realtime) {
r = apply_restrict_realtime(context);
if (r < 0) {
*exit_status = EXIT_SECCOMP;
return r;
}
}
if (use_syscall_filter) {
r = apply_seccomp(context);
if (r < 0) {
@ -2474,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sIgnoreSIGPIPE: %s\n"
"%sMemoryDenyWriteExecute: %s\n",
"%sMemoryDenyWriteExecute: %s\n"
"%sRestrictRealtime: %s\n",
prefix, c->umask,
prefix, c->working_directory ? c->working_directory : "/",
prefix, c->root_directory ? c->root_directory : "/",
@ -2485,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(c->ignore_sigpipe),
prefix, yes_no(c->memory_deny_write_execute));
prefix, yes_no(c->memory_deny_write_execute),
prefix, yes_no(c->restrict_realtime));
STRV_FOREACH(e, c->environment)
fprintf(f, "%sEnvironment: %s\n", prefix, *e);

View File

@ -193,12 +193,14 @@ struct ExecContext {
char **runtime_directory;
mode_t runtime_directory_mode;
bool memory_deny_write_execute;
bool restrict_realtime;
bool oom_score_adjust_set:1;
bool nice_set:1;
bool ioprio_set:1;
bool cpu_sched_set:1;
bool no_new_privileges_set:1;
bool memory_deny_write_execute;
};
#include "cgroup-util.h"

View File

@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP',
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)
$1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context)
$1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute)
$1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime)
$1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)',
`$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
$1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0')
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
$1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit)