cgroup: Add ManagedOOMMemoryPressureDurationSec= override setting for units

This will allow units (scopes/slices/services) to override the default
systemd-oomd setting DefaultMemoryPressureDurationSec=.

The semantics of ManagedOOMMemoryPressureDurationSec= are:
- If >= 1 second, overrides DefaultMemoryPressureDurationSec= from oomd.conf
- If empty, uses DefaultMemoryPressureDurationSec= from oomd.conf
- Ignored if ManagedOOMMemoryPressure= is not "kill"
- Disallowed if < 1 second

Note the corresponding dbus property is ManagedOOMMemoryPressureDurationUSec
which is in microseconds. This is consistent with other time-based
dbus properties.
This commit is contained in:
Ryan Wilson 2024-10-14 20:49:54 -07:00
parent c43ef2e883
commit 63d4c4271c
21 changed files with 293 additions and 21 deletions

View File

@ -281,6 +281,7 @@ All cgroup/resource control settings are available for transient units
✓ ManagedOOMSwap=
✓ ManagedOOMMemoryPressure=
✓ ManagedOOMMemoryPressureLimit=
✓ ManagedOOMMemoryPressureDurationSec=
✓ ManagedOOMPreference=
✓ CoredumpReceive=
```

View File

@ -90,7 +90,8 @@
<term><varname>DefaultMemoryPressureDurationSec=</varname></term>
<listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure
limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
limits before <command>systemd-oomd</command> will take action. A unit can override this value with
<varname>ManagedOOMMemoryPressureDurationSec=</varname>. Memory pressure limits are defined by
<varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>.
Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0.</para>

View File

@ -2993,6 +2993,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimit = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t ManagedOOMMemoryPressureDurationUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly a(ss) BPFProgram = [...];
@ -4312,6 +4314,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimit"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureDurationUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
@ -4849,6 +4853,11 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
method. See <citerefentry><refentrytitle>sd_listen_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry>
for more details on how to retrieve these file descriptors. Unlike the <varname>ExtraFileDescriptors</varname>
input property, <varname>ExtraFileDescriptorNames</varname> only contains names and not the file descriptors.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> implements the destination parameter of the
unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note the time unit is expressed in <literal>μs</literal>.</para>
</refsect2>
</refsect1>
@ -5148,6 +5157,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimit = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t ManagedOOMMemoryPressureDurationUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly a(ss) BPFProgram = [...];
@ -6451,6 +6462,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimit"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureDurationUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
@ -7145,6 +7158,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimit = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t ManagedOOMMemoryPressureDurationUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly a(ss) BPFProgram = [...];
@ -8286,6 +8301,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimit"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureDurationUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
@ -9109,6 +9126,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimit = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t ManagedOOMMemoryPressureDurationUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly a(ss) BPFProgram = [...];
@ -10222,6 +10241,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimit"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureDurationUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
@ -10898,6 +10919,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimit = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t ManagedOOMMemoryPressureDurationUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly a(ss) BPFProgram = [...];
@ -11285,6 +11308,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimit"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureDurationUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
@ -11309,6 +11334,11 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<title>Properties</title>
<para>Most properties correspond directly with the matching settings in slice unit files.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> implements the destination parameter of the
unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note the time unit is expressed in <literal>μs</literal>.</para>
</refsect2>
</refsect1>
@ -11507,6 +11537,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimit = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t ManagedOOMMemoryPressureDurationUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly a(ss) BPFProgram = [...];
@ -11944,6 +11976,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimit"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureDurationUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFProgram"/>
@ -12004,6 +12038,11 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
the scope unit is to be shut down via a <function>RequestStop()</function> signal (see below). This is
set when the scope is created. If not set, the scope's processes will be terminated with
<constant>SIGTERM</constant> directly.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> implements the destination parameter of the
unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note the time unit is expressed in <literal>μs</literal>.</para>
</refsect2>
</refsect1>
@ -12222,6 +12261,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivateTmpEx</varname>,
<varname>ImportCredentialEx</varname>,
<varname>ExtraFileDescriptorNames</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>BindLogSockets</varname>, and
<varname>PrivateUsersEx</varname> were added in version 257.</para>
</refsect2>
@ -12362,6 +12402,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>EffectiveMemoryMax</varname>,
<varname>EffectiveTasksMax</varname>, and
<varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> was added in version 257.</para>
</refsect2>
<refsect2>
<title>Scope Unit Objects</title>
@ -12387,6 +12428,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>EffectiveMemoryMax</varname>,
<varname>EffectiveTasksMax</varname>, and
<varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> was added in version 257.</para>
</refsect2>
<refsect2>
<title>Job Objects</title>

View File

@ -1535,16 +1535,35 @@ DeviceAllow=/dev/loop-control
<listitem>
<para>Overrides the default memory pressure limit set by
<citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
this unit (cgroup). Takes a percentage value between 0% and 100%, inclusive. This property is
ignored unless <varname>ManagedOOMMemoryPressure=</varname><option>kill</option>. Defaults to 0%,
the cgroup of this unit. Takes a percentage value between 0% and 100%, inclusive. Defaults to 0%,
which means to use the default set by
<citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
This property is ignored unless <varname>ManagedOOMMemoryPressure=</varname><option>kill</option>.
</para>
<xi:include href="version-info.xml" xpointer="v247"/>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>ManagedOOMMemoryPressureDurationSec=</varname></term>
<listitem>
<para>Overrides the default memory pressure duration set by
<citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
the cgroup of this unit. The specified value supports a time unit such as <literal>ms</literal> or
<literal>μs</literal>, see
<citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry>
for details on the permitted syntax. Must be set to either empty or a value of at least 1s. Defaults
to empty, which means to use the default set by
<citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
This property is ignored unless <varname>ManagedOOMMemoryPressure=</varname><option>kill</option>.
</para>
<xi:include href="version-info.xml" xpointer="v257"/>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>ManagedOOMPreference=none|avoid|omit</varname></term>

View File

@ -194,6 +194,9 @@ void cgroup_context_init(CGroupContext *c) {
.moom_swap = MANAGED_OOM_AUTO,
.moom_mem_pressure = MANAGED_OOM_AUTO,
.moom_preference = MANAGED_OOM_PREFERENCE_NONE,
/* The default duration value in oomd.conf will be used when
* moom_mem_pressure_duration_usec is set to infinity. */
.moom_mem_pressure_duration_usec = USEC_INFINITY,
.memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
.memory_pressure_threshold_usec = USEC_INFINITY,
@ -947,6 +950,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
if (c->moom_mem_pressure_duration_usec != USEC_INFINITY)
fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n",
prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1));
LIST_FOREACH(device_allow, a, c->device_allow)
/* strna() below should be redundant, for avoiding -Werror=format-overflow= error. See #30223. */
fprintf(f,

View File

@ -236,6 +236,7 @@ struct CGroupContext {
ManagedOOMMode moom_swap;
ManagedOOMMode moom_mem_pressure;
uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
usec_t moom_mem_pressure_duration_usec;
ManagedOOMPreference moom_preference;
/* Memory pressure logic */

View File

@ -57,7 +57,7 @@ static bool user_match_lookup_parameters(LookupParameters *p, const char *name,
}
static int build_managed_oom_json_array_element(Unit *u, const char *property, sd_json_variant **ret_v) {
bool use_limit = false;
bool use_limit = false, use_duration = false;
CGroupContext *c;
const char *mode;
@ -84,7 +84,8 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, s
mode = managed_oom_mode_to_string(c->moom_swap);
else if (streq(property, "ManagedOOMMemoryPressure")) {
mode = managed_oom_mode_to_string(c->moom_mem_pressure);
use_limit = true;
use_limit = c->moom_mem_pressure_limit > 0;
use_duration = c->moom_mem_pressure_duration_usec != USEC_INFINITY;
} else
return -EINVAL;
@ -92,7 +93,8 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, s
SD_JSON_BUILD_PAIR("mode", SD_JSON_BUILD_STRING(mode)),
SD_JSON_BUILD_PAIR("path", SD_JSON_BUILD_STRING(crt->cgroup_path)),
SD_JSON_BUILD_PAIR("property", SD_JSON_BUILD_STRING(property)),
SD_JSON_BUILD_PAIR_CONDITION(use_limit, "limit", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)));
SD_JSON_BUILD_PAIR_CONDITION(use_limit, "limit", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)),
SD_JSON_BUILD_PAIR_CONDITION(use_duration, "duration", SD_JSON_BUILD_UNSIGNED(c->moom_mem_pressure_duration_usec)));
}
static int build_managed_oom_cgroups_json(Manager *m, sd_json_variant **ret) {

View File

@ -502,6 +502,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0),
SD_BUS_PROPERTY("ManagedOOMMemoryPressureDurationUSec", "t", bus_property_get_usec, offsetof(CGroupContext, moom_mem_pressure_duration_usec), 0),
SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0),
SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0),
@ -2053,6 +2054,36 @@ int bus_cgroup_set_property(
return 1;
}
/* ManagedOOMMemoryPressureDurationUSec: per-unit override of the memory pressure duration used by
 * systemd-oomd. USEC_INFINITY means "not set" and is written out as an empty
 * ManagedOOMMemoryPressureDurationSec= assignment. */
if (streq(name, "ManagedOOMMemoryPressureDurationUSec")) {
        uint64_t t;

        if (!UNIT_VTABLE(u)->can_set_managed_oom)
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name);

        r = sd_bus_message_read(message, "t", &t);
        if (r < 0)
                return r;

        /* Anything shorter than 1s is refused; USEC_INFINITY passes this check. */
        if (t < 1 * USEC_PER_SEC)
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "%s= must be at least 1s, got %s", name,
                                         FORMAT_TIMESPAN(t, USEC_PER_SEC));

        if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
                /* Store into the managed OOM duration field. memory_pressure_threshold_usec is a
                 * different setting (MemoryPressureThresholdSec=) and must not be touched here. */
                c->moom_mem_pressure_duration_usec = t;
                if (c->moom_mem_pressure_duration_usec == USEC_INFINITY)
                        unit_write_setting(u, flags, name, "ManagedOOMMemoryPressureDurationSec=");
                else
                        unit_write_settingf(u, flags, name,
                                            "ManagedOOMMemoryPressureDurationSec=%s",
                                            FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1));
        }

        /* Only notify when this unit is in "kill" mode for memory pressure. */
        if (c->moom_mem_pressure == MANAGED_OOM_KILL)
                (void) manager_varlink_send_managed_oom_update(u);

        return 1;
}
if (streq(name, "ManagedOOMPreference")) {
ManagedOOMPreference p;
const char *pref;

View File

@ -328,6 +328,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
if (r < 0)
return r;
r = serialize_usec(f, "exec-cgroup-context-managed-oom-memory-pressure-duration-usec", c->moom_mem_pressure_duration_usec);
if (r < 0)
return r;
r = serialize_item(f, "exec-cgroup-context-managed-oom-preference", managed_oom_preference_to_string(c->moom_preference));
if (r < 0)
return r;
@ -781,6 +785,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
c->moom_preference = managed_oom_preference_from_string(val);
if (c->moom_preference < 0)
return -EINVAL;
} else if ((val = startswith(l, "exec-cgroup-context-managed-oom-memory-pressure-duration-usec="))) {
r = deserialize_usec(val, &c->moom_mem_pressure_duration_usec);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-cgroup-context-memory-pressure-watch="))) {
c->memory_pressure_watch = cgroup_pressure_watch_from_string(val);
if (c->memory_pressure_watch < 0)

View File

@ -253,6 +253,7 @@
{{type}}.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_swap)
{{type}}.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure)
{{type}}.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_limit)
{{type}}.ManagedOOMMemoryPressureDurationSec, config_parse_managed_oom_mem_pressure_duration_sec, 0, offsetof({{type}}, cgroup_context.moom_mem_pressure_duration_usec)
{{type}}.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof({{type}}, cgroup_context.moom_preference)
{{type}}.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0
{{type}}.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof({{type}}, cgroup_context)

View File

@ -4121,6 +4121,44 @@ int config_parse_managed_oom_mem_pressure_limit(
return 0;
}
/* Parses ManagedOOMMemoryPressureDurationSec= from a unit file.
 *
 * 'data' points at the usec_t that receives the result. An empty value stores USEC_INFINITY.
 * Values below 1s, or equal to infinity, are logged as a warning and leave the stored value
 * untouched. Unit types that do not set can_set_managed_oom get a warning and nothing is stored. */
int config_parse_managed_oom_mem_pressure_duration_sec(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        usec_t *ret_duration = ASSERT_PTR(data);
        usec_t parsed;
        UnitType type;
        int r;

        type = unit_name_to_type(unit);
        assert(type != _UNIT_TYPE_INVALID);

        if (!unit_vtable[type]->can_set_managed_oom)
                return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= is not supported for this unit type, ignoring.", lvalue);

        /* Empty assignment resets the setting to its "unset" marker. */
        if (isempty(rvalue)) {
                *ret_duration = USEC_INFINITY;
                return 0;
        }

        r = parse_sec(rvalue, &parsed);
        if (r < 0)
                return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);

        /* Infinity is reserved as the "unset" marker, so it is refused as an explicit value. */
        if (parsed == USEC_INFINITY || parsed < 1 * USEC_PER_SEC)
                return log_syntax(unit, LOG_WARNING, filename, line, 0, "%s= must be at least 1s and less than infinity, ignoring: %s", lvalue, rvalue);

        *ret_duration = parsed;
        return 0;
}
int config_parse_device_allow(
const char *unit,
const char *filename,

View File

@ -88,6 +88,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate_subgroup);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_duration_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);

View File

@ -24,6 +24,7 @@ typedef struct ManagedOOMMessage {
char *path;
char *property;
uint32_t limit;
usec_t duration;
} ManagedOOMMessage;
static void managed_oom_message_destroy(ManagedOOMMessage *message) {
@ -43,6 +44,7 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
{ "path", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(ManagedOOMMessage, path), SD_JSON_MANDATORY },
{ "property", SD_JSON_VARIANT_STRING, sd_json_dispatch_string, offsetof(ManagedOOMMessage, property), SD_JSON_MANDATORY },
{ "limit", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint32, offsetof(ManagedOOMMessage, limit), 0 },
{ "duration", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint64, offsetof(ManagedOOMMessage, duration), 0 },
{},
};
@ -55,10 +57,13 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
/* Skip malformed elements and keep processing in case the others are good */
JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
_cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {};
_cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {
.duration = USEC_INFINITY,
};
OomdCGroupContext *ctx;
Hashmap *monitor_hm;
loadavg_t limit;
usec_t duration;
if (!sd_json_variant_is_object(c))
continue;
@ -104,6 +109,11 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
continue;
}
if (streq(message.property, "ManagedOOMMemoryPressure") && message.duration != USEC_INFINITY)
duration = message.duration;
else
duration = m->default_mem_pressure_duration_usec;
r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
if (r == -ENOMEM)
return r;
@ -113,8 +123,10 @@ static int process_managed_oom_message(Manager *m, uid_t uid, sd_json_variant *p
/* Always update the limit in case it was changed. For non-memory pressure detection the value is
* ignored so always updating it here is not a problem. */
ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
if (ctx)
if (ctx) {
ctx->mem_pressure_limit = limit;
ctx->mem_pressure_duration_usec = duration;
}
}
/* Toggle wake-ups for "ManagedOOMSwap" if entries are present. */
@ -472,7 +484,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
m->mem_pressure_post_action_delay_start = 0;
}
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, &targets);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
@ -494,7 +506,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
t->path,
LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC));
r = update_monitored_cgroup_contexts_candidates(
m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
@ -526,7 +538,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
selected, t->path,
LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
FORMAT_TIMESPAN(t->mem_pressure_duration_usec, USEC_PER_SEC));
/* send dbus signal */
(void) sd_bus_emit_signal(m->bus,

View File

@ -69,7 +69,7 @@ OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) {
return mfree(ctx);
}
int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
int oomd_pressure_above(Hashmap *h, Set **ret) {
_cleanup_set_free_ Set *targets = NULL;
OomdCGroupContext *ctx;
char *key;
@ -90,7 +90,7 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC);
diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start;
if (diff >= duration) {
if (diff >= ctx->mem_pressure_duration_usec) {
r = set_put(targets, ctx);
if (r < 0)
return -ENOMEM;
@ -564,6 +564,7 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
curr_ctx->last_pgscan = old_ctx->pgscan;
curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
curr_ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec;
curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
}
@ -594,6 +595,7 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
ctx->last_pgscan = old_ctx->pgscan;
ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
ctx->mem_pressure_duration_usec = old_ctx->mem_pressure_duration_usec;
ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
if (oomd_pgscan_rate(ctx) > 0)
@ -626,10 +628,12 @@ void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE
fprintf(f,
"%sPath: %s\n"
"%s\tMemory Pressure Limit: %lu.%02lu%%\n"
"%s\tMemory Pressure Duration: %s\n"
"%s\tPressure: Avg10: %lu.%02lu, Avg60: %lu.%02lu, Avg300: %lu.%02lu, Total: %s\n"
"%s\tCurrent Memory Usage: %s\n",
strempty(prefix), ctx->path,
strempty(prefix), LOADAVG_INT_SIDE(ctx->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(ctx->mem_pressure_limit),
strempty(prefix), FORMAT_TIMESPAN(ctx->mem_pressure_duration_usec, USEC_PER_SEC),
strempty(prefix),
LOADAVG_INT_SIDE(ctx->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg10),
LOADAVG_INT_SIDE(ctx->memory_pressure.avg60), LOADAVG_DECIMAL_SIDE(ctx->memory_pressure.avg60),

View File

@ -37,6 +37,7 @@ struct OomdCGroupContext {
loadavg_t mem_pressure_limit;
usec_t mem_pressure_limit_hit_start;
usec_t last_had_mem_reclaim;
usec_t mem_pressure_duration_usec;
};
struct OomdSystemContext {
@ -53,12 +54,12 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
* key: cgroup paths -> value: OomdCGroupContext. */
/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
* if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
* if any of them have exceeded their supplied memory pressure limits for the `ctx->mem_pressure_duration_usec` length of time.
* `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
* below the limit.
* Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`.
* Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `ctx->mem_pressure_duration_usec`.
* Returns -ENOMEM for allocation errors. */
int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
int oomd_pressure_above(Hashmap *h, Set **ret);
/* Returns true if the amount of memory available (see proc(5)) is below the permyriad of memory specified by `threshold_permyriad`. */
bool oomd_mem_available_below(const OomdSystemContext *ctx, int threshold_permyriad);

View File

@ -138,6 +138,7 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
c1->pgscan = UINT64_MAX;
c1->mem_pressure_limit = 6789;
c1->mem_pressure_limit_hit_start = 42;
c1->mem_pressure_duration_usec = 1234;
c1->last_had_mem_reclaim = 888;
assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
@ -149,6 +150,7 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(c2->last_pgscan == UINT64_MAX);
assert_se(c2->mem_pressure_limit == 6789);
assert_se(c2->mem_pressure_limit_hit_start == 42);
assert_se(c2->mem_pressure_duration_usec == 1234);
assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
}
@ -162,11 +164,13 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
{ .path = paths[0],
.mem_pressure_limit = 5,
.mem_pressure_limit_hit_start = 777,
.mem_pressure_duration_usec = 111,
.last_had_mem_reclaim = 888,
.pgscan = 57 },
{ .path = paths[1],
.mem_pressure_limit = 6,
.mem_pressure_limit_hit_start = 888,
.mem_pressure_duration_usec = 222,
.last_had_mem_reclaim = 888,
.pgscan = 42 },
};
@ -193,6 +197,7 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
assert_se(c_old->pgscan == c_new->last_pgscan);
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
assert_se(c_old->mem_pressure_duration_usec == c_new->mem_pressure_duration_usec);
assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
assert_se(c_old = hashmap_get(h_old, "/1.slice"));
@ -200,6 +205,7 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
assert_se(c_old->pgscan == c_new->last_pgscan);
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
assert_se(c_old->mem_pressure_duration_usec == c_new->mem_pressure_duration_usec);
assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
}
@ -255,17 +261,21 @@ static void test_oomd_pressure_above(void) {
assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0);
assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0);
ctx[0].mem_pressure_limit = threshold;
/* Set memory pressure duration to 0 since we use the real system monotonic clock
* in oomd_pressure_above() and we want to avoid this test depending on timing. */
ctx[0].mem_pressure_duration_usec = 0;
/* /derp.slice */
assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0);
assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0);
assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0);
ctx[1].mem_pressure_limit = threshold;
ctx[1].mem_pressure_duration_usec = 0;
/* High memory pressure */
assert_se(h1 = hashmap_new(&string_hash_ops));
assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0);
assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
assert_se(oomd_pressure_above(h1, &t1) == 1);
assert_se(set_contains(t1, &ctx[0]));
assert_se(c = hashmap_get(h1, "/herp.slice"));
assert_se(c->mem_pressure_limit_hit_start > 0);
@ -273,14 +283,14 @@ static void test_oomd_pressure_above(void) {
/* Low memory pressure */
assert_se(h2 = hashmap_new(&string_hash_ops));
assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0);
assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
assert_se(oomd_pressure_above(h2, &t2) == 0);
assert_se(!t2);
assert_se(c = hashmap_get(h2, "/derp.slice"));
assert_se(c->mem_pressure_limit_hit_start == 0);
/* High memory pressure w/ multiple cgroups */
assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1);
assert_se(oomd_pressure_above(h1, &t3) == 1);
assert_se(set_contains(t3, &ctx[0]));
assert_se(set_size(t3) == 1);
assert_se(c = hashmap_get(h1, "/herp.slice"));

View File

@ -109,6 +109,12 @@ static int bus_print_property(const char *name, const char *expected_value, sd_b
bus_print_property_value(name, expected_value, flags, FORMAT_TIMESTAMP(u));
/* For the managed OOM memory pressure duration, USEC_INFINITY means "unset", i.e. the default
* from oomd.conf is used. Without this condition we would print "infinity", which would
* incorrectly suggest that there is no limit on the memory pressure duration. */
else if (streq(name, "ManagedOOMMemoryPressureDurationUSec") && u == USEC_INFINITY)
bus_print_property_value(name, expected_value, flags, "[not set]");
else if (strstr(name, "USec"))
bus_print_property_value(name, expected_value, flags, FORMAT_TIMESPAN(u, 0));

View File

@ -1008,6 +1008,11 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
if (streq(field, "NFTSet"))
return bus_append_nft_set(m, field, eq);
if (streq(field, "ManagedOOMMemoryPressureDurationSec"))
/* While infinity is disallowed in unit files, it is allowed in the D-Bus API, where it
* means: use the default memory pressure duration from oomd.conf. */
return bus_append_parse_sec_rename(m, field, isempty(eq) ? "infinity" : eq);
return 0;
}

View File

@ -12,7 +12,8 @@ SD_VARLINK_DEFINE_STRUCT_TYPE(
SD_VARLINK_DEFINE_FIELD(mode, SD_VARLINK_STRING, 0),
SD_VARLINK_DEFINE_FIELD(path, SD_VARLINK_STRING, 0),
SD_VARLINK_DEFINE_FIELD(property, SD_VARLINK_STRING, 0),
SD_VARLINK_DEFINE_FIELD(limit, SD_VARLINK_INT, SD_VARLINK_NULLABLE));
SD_VARLINK_DEFINE_FIELD(limit, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
SD_VARLINK_DEFINE_FIELD(duration, SD_VARLINK_INT, SD_VARLINK_NULLABLE));
static SD_VARLINK_DEFINE_METHOD(
ReportManagedOOMCGroups,

View File

@ -154,6 +154,7 @@ MaxConnectionsPerSource=
ManagedOOMSwap=
ManagedOOMMemoryPressure=
ManagedOOMMemoryPressureLimitPercent=
ManagedOOMMemoryPressureDurationSec=
ManagedOOMPreference=
MemoryAccounting=
MemoryHigh=

View File

@ -106,7 +106,7 @@ test_basic() {
# Verify systemd-oomd is monitoring the expected units.
timeout 1m bash -xec "until oomctl | grep -q -F 'Path: $cgroup_path'; do sleep 1; done"
assert_in 'Memory Pressure Limit: 20.00%' \
"$(oomctl | tac | sed -e '/Memory Pressure Monitored CGroups:/q' | tac | grep -A7 "Path: $cgroup_path")"
"$(oomctl | tac | sed -e '/Memory Pressure Monitored CGroups:/q' | tac | grep -A8 "Path: $cgroup_path")"
systemctl "$@" start TEST-55-OOMD-testbloat.service
@ -181,6 +181,86 @@ EOF
systemctl stop TEST-55-OOMD-testmunch.service
systemctl stop TEST-55-OOMD-testchill.service
systemctl stop TEST-55-OOMD-workload.slice
# clean up overrides since test cases can be run in any order
# and overrides shouldn't affect other tests
rm -rf /run/systemd/system/TEST-55-OOMD-testbloat.service.d
systemctl daemon-reload
}
testcase_duration_analyze() {
    # Static check of ManagedOOMMemoryPressureDurationSec= via "systemd-analyze verify":
    # a duration of 1s must be accepted, a duration of 0 must be rejected.
    local accepted_unit=/tmp/TEST-55-OOMD-valid-duration.service
    local rejected_unit=/tmp/TEST-55-OOMD-invalid-duration.service
    local unit_file

    # Unit whose memory pressure duration is valid (>= 1 second)
    printf '%s\n' \
        '[Service]' \
        'ExecStart=echo hello' \
        'ManagedOOMMemoryPressureDurationSec=1s' >"$accepted_unit"

    # Unit whose memory pressure duration is invalid (< 1 second)
    printf '%s\n' \
        '[Service]' \
        'ExecStart=echo hello' \
        'ManagedOOMMemoryPressureDurationSec=0' >"$rejected_unit"

    systemd-analyze --recursive-errors=no verify "$accepted_unit"
    # The subshell turns a *successful* verification of the invalid unit into a failure.
    (! systemd-analyze --recursive-errors=no verify "$rejected_unit")

    # Remove the temporary unit files again, valid one first.
    for unit_file in "$accepted_unit" "$rejected_unit"; do
        rm -f "$unit_file"
    done
}
# Check that a unit can override the memory pressure duration used by systemd-oomd through
# ManagedOOMMemoryPressureDurationSec=, and that an empty assignment leaves the unit on the default.
# Exits with 44 if testmunch is still running at the end of the polling loop, and with 23 if
# testchill is no longer running.
testcase_duration_override() {
# Verify memory pressure duration can be overridden to non-zero values
mkdir -p /run/systemd/system/TEST-55-OOMD-testmunch.service.d/
cat >/run/systemd/system/TEST-55-OOMD-testmunch.service.d/99-duration-test.conf <<EOF
[Service]
ManagedOOMMemoryPressureDurationSec=3s
ManagedOOMMemoryPressure=kill
EOF
# Verify memory pressure duration will use default if set to empty
mkdir -p /run/systemd/system/TEST-55-OOMD-testchill.service.d/
cat >/run/systemd/system/TEST-55-OOMD-testchill.service.d/99-duration-test.conf <<EOF
[Service]
ManagedOOMMemoryPressureDurationSec=
ManagedOOMMemoryPressure=kill
EOF
# Make systemd pick up the new drop-ins before the units are started
systemctl daemon-reload
systemctl start TEST-55-OOMD-testmunch.service
systemctl start TEST-55-OOMD-testchill.service
# Wait (up to one minute) until oomctl lists testmunch, then check the overridden duration
timeout 1m bash -xec 'until oomctl | grep "/TEST-55-OOMD-testmunch.service"; do sleep 1; done'
oomctl | grep -A 2 "/TEST-55-OOMD-testmunch.service" | grep "Memory Pressure Duration: 3s"
# Same for testchill, which should report the default duration.
# NOTE(review): "2s" is presumably the DefaultMemoryPressureDurationSec= configured for this
# test's oomd.conf - confirm against the test setup.
timeout 1m bash -xec 'until oomctl | grep "/TEST-55-OOMD-testchill.service"; do sleep 1; done'
oomctl | grep -A 2 "/TEST-55-OOMD-testchill.service" | grep "Memory Pressure Duration: 2s"
# The property as shown by systemctl: the override value for testmunch, "[not set]" for testchill
[[ "$(systemctl show -P ManagedOOMMemoryPressureDurationUSec TEST-55-OOMD-testmunch.service)" == "3s" ]]
[[ "$(systemctl show -P ManagedOOMMemoryPressureDurationUSec TEST-55-OOMD-testchill.service)" == "[not set]" ]]
# Poll up to 60 times, 2 seconds apart, until "systemctl status" fails for testmunch;
# the oomctl output is dumped on every iteration for debugging
for _ in {0..59}; do
if ! systemctl status TEST-55-OOMD-testmunch.service; then
break
fi
oomctl
sleep 2
done
# testmunch must be gone by now, while testchill must still be running
if systemctl status TEST-55-OOMD-testmunch.service; then exit 44; fi
if ! systemctl status TEST-55-OOMD-testchill.service; then exit 23; fi
# Best-effort kill: a failure of "systemctl kill" is deliberately ignored
systemctl kill --signal=KILL TEST-55-OOMD-testmunch.service || :
systemctl stop TEST-55-OOMD-testmunch.service
systemctl stop TEST-55-OOMD-testchill.service
systemctl stop TEST-55-OOMD-workload.slice
# clean up overrides since test cases can be run in any order
# and overrides shouldn't affect other tests
rm -rf /run/systemd/system/TEST-55-OOMD-testmunch.service.d
rm -rf /run/systemd/system/TEST-55-OOMD-testchill.service.d
systemctl daemon-reload
}
run_testcases