emit a warning in networkd if managed sysctls are changed

Monitor the sysctls set by networkd for writes; if a sysctl is
overwritten with a value different from the one we set, emit a warning.
Writes are detected with an eBPF program attached as BPF_CGROUP_SYSCTL
which reports the sysctl writes only in net/.

The eBPF program only reports sysctl writes from a different cgroup than networkd.
To do this, it uses the `bpf_current_task_under_cgroup()` helper,
which will be allowed in BPF_CGROUP_SYSCTL programs from kernel 6.12[1].

Loading a BPF_CGROUP_SYSCTL program requires the CAP_SYS_ADMIN capability,
so drop it just after the program load, whether it loads successfully or not.

Writes are logged but permitted; in the future the functionality can be
extended to also deny writes to managed sysctls.

[1] https://lore.kernel.org/bpf/20240819162805.78235-3-technoboy85@gmail.com/
This commit is contained in:
Matteo Croce 2024-07-01 21:58:30 +02:00
parent 64629617b6
commit 6d9ef22acd
14 changed files with 469 additions and 4 deletions

View File

@ -794,3 +794,12 @@ the TPM.
Automatic SRK enrollment on TPMs in such scenarios is not supported. In order to unset the PIN/password
protection on the owner hierarchy issue a command like the following: 'tpm2_changeauth -c o -p <OLDPW> ""'.
-- 9cf56b8baf9546cf9478783a8de42113
Subject: A foreign process changed a sysctl we manage
Defined-By: systemd
Support: %SUPPORT_URL%
A sysctl handle under /proc/sys/net, which is managed by systemd-networkd, has been changed by another process.
The event is raised only if the written value differs from the current one.
The program name, the written value, the previous value, and the value initially set by networkd have been logged.

View File

@ -0,0 +1,25 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
# Build rules for the sysctl monitor BPF program and its skeleton header.
# Nothing in this directory is built unless HAVE_VMLINUX_H is set to 1.
if conf.get('HAVE_VMLINUX_H') != 1
subdir_done()
endif
# Step 1: build sysctl-monitor.bpf.c into an unstripped object using
# bpf_o_unstripped_cmd (defined elsewhere in the build); depends on the
# generated vmlinux.h.
sysctl_monitor_bpf_o_unstripped = custom_target(
'sysctl-monitor.bpf.unstripped.o',
input : 'sysctl-monitor.bpf.c',
output : 'sysctl-monitor.bpf.unstripped.o',
command : bpf_o_unstripped_cmd,
depends : vmlinux_h_dependency)
# Step 2: derive the final object from the unstripped one using bpf_o_cmd
# (presumably strips debug info, judging by the target names).
sysctl_monitor_bpf_o = custom_target(
'sysctl-monitor.bpf.o',
input : sysctl_monitor_bpf_o_unstripped,
output : 'sysctl-monitor.bpf.o',
command : bpf_o_cmd)
# Step 3: generate the skeleton header from the final object. The output of
# skel_h_cmd on stdout is captured into sysctl-monitor.skel.h.
sysctl_monitor_skel_h = custom_target(
'sysctl-monitor.skel.h',
input : sysctl_monitor_bpf_o,
output : 'sysctl-monitor.skel.h',
command : skel_h_cmd,
capture : true)

View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
/* The SPDX header above is actually correct in claiming this was
* LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
* compatible with GPL we will claim this to be GPL however, which should be
* fine given that LGPL-2.1-or-later downgrades to GPL if needed.
*/
#include "bpf-dlopen.h"
/* libbpf is used via dlopen(), so rename symbols */
#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
#include "bpf/sysctl_monitor/sysctl-monitor.skel.h"

View File

@ -0,0 +1,134 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "sysctl-write-event.h"
/* Single-entry cgroup array. sysctl_monitor() passes index 0 of this map to
 * bpf_current_task_under_cgroup() in order to skip writes issued from that
 * cgroup. The userspace loader (sysctl_add_monitor() in networkd) stores a
 * file descriptor of its own cgroup at index 0. */
struct {
__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
__type(key, u32);
__type(value, u32);
__uint(max_entries, 1);
} cgroup_map SEC(".maps");
/* Ring buffer through which sysctl_monitor() hands struct sysctl_write_event
 * records to userspace via bpf_ringbuf_output(). For ring buffer maps,
 * max_entries is the buffer size in bytes, i.e. 256 KiB here. */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 256 * 1024);
} written_sysctls SEC(".maps");
/* Bounded string equality: compares at most 'l' bytes of 's1' and 's2'.
 * Returns true if the two strings match up to and including their first NUL
 * terminator, or if the first 'l' bytes are identical; false on the first
 * differing byte. */
static bool my_streq(const char *s1, const char *s2, size_t l) {
        size_t pos = 0;

        while (pos < l) {
                char a = s1[pos];
                char b = s2[pos];

                if (a != b)
                        return false;

                /* Both strings ended at the same position. */
                if (a == '\0')
                        return true;

                pos++;
        }

        /* No difference found within the first 'l' bytes. */
        return true;
}
/* Buffer descriptor passed by chop() as the bpf_loop() callback context
 * consumed by cut_last(). */
struct str {
/* Start of the buffer. */
char *s;
/* Size of the buffer in bytes. */
size_t l;
};
/* bpf_loop() callback used by chop(). Each invocation examines one byte of
 * the buffer described by 'str', walking from the last byte towards the
 * first: iteration index 'i' is mirrored so that iteration 0 looks at offset
 * l - 1. Returns 0 to continue with the next iteration and 1 to stop the
 * loop. */
static long cut_last(u32 i, struct str *str) {
char *s;
/* Mirror the index: iteration 0 maps to the last byte of the buffer. */
i = str->l - i - 1;
s = str->s + i;
/* Sanity check for the preverifier */
if (i >= str->l)
return 1;
/* NUL bytes at the end are padding or the terminator: keep walking backwards. */
if (*s == 0)
return 0;
/* Trailing whitespace is overwritten with NUL, then the walk continues. */
if (*s == '\n' || *s == '\r' || *s == ' ' || *s == '\t') {
*s = 0;
return 0;
}
/* First non-whitespace byte found: nothing more to cut. */
return 1;
}
/* Cut off trailing whitespace and newlines */
static void chop(char *s, size_t l) {
struct str str = { s, l };
bpf_loop(l, cut_last, &str, 0);
}
/* cgroup/sysctl program: reports writes below net/ to userspace.
 *
 * Accesses from the cgroup stored in slot 0 of cgroup_map, reads, and
 * sysctls whose name does not start with "net/" are ignored. For everything
 * else a struct sysctl_write_event is filled in and pushed to the
 * written_sysctls ring buffer, either when the new value differs from the
 * current one or when one of the helpers failed (in which case errorcode
 * carries the negative return value).
 *
 * Every path returns 1: the program only observes, it never denies access. */
SEC("cgroup/sysctl")
int sysctl_monitor(struct bpf_sysctl *ctx) {
int r;
/* Ignore events generated by us */
if (bpf_current_task_under_cgroup(&cgroup_map, 0))
return 1;
/* Allow reads */
if (!ctx->write)
return 1;
/* Declare the struct without contextually initializing it.
* This avoids zero-filling the struct, which would be a waste of
* resources and code size. Since we're sending an event even on failure,
* truncate the strings to zero size, in case we don't populate them. */
struct sysctl_write_event we;
we.version = 1;
we.errorcode = 0;
we.path[0] = 0;
we.comm[0] = 0;
we.current[0] = 0;
we.newvalue[0] = 0;
/* Set the simple values first */
/* The upper 32 bits of the helper's return value are used as the PID. */
we.pid = bpf_get_current_pid_tgid() >> 32;
we.cgroup_id = bpf_get_current_cgroup_id();
/* Only monitor /proc/sys/net/ */
r = bpf_sysctl_get_name(ctx, we.path, sizeof(we.path), 0);
if (r < 0) {
we.errorcode = r;
goto send_event;
}
/* Anything outside net/ is silently let through without an event. */
if (bpf_strncmp(we.path, 4, "net/") != 0)
return 1;
r = bpf_get_current_comm(we.comm, sizeof(we.comm));
if (r < 0) {
we.errorcode = r;
goto send_event;
}
r = bpf_sysctl_get_current_value(ctx, we.current, sizeof(we.current));
if (r < 0) {
we.errorcode = r;
goto send_event;
}
r = bpf_sysctl_get_new_value(ctx, we.newvalue, sizeof(we.newvalue));
if (r < 0) {
we.errorcode = r;
goto send_event;
}
/* Both the kernel and userspace applications add a newline at the end,
* remove it from both strings */
chop(we.current, sizeof(we.current));
chop(we.newvalue, sizeof(we.newvalue));
send_event:
/* If new value differs or we encountered an error, send the event */
if (r < 0 || !my_streq(we.current, we.newvalue, sizeof(we.current)))
bpf_ringbuf_output(&written_sysctls, &we, sizeof(we), 0);
/* Always let the access through; this program only reports. */
return 1;
}
/* License string placed in the "license" section of the BPF object. As noted
 * in the skeleton wrapper header, the code is LGPL-2.1-or-later but is
 * declared as GPL here because the kernel does not consider LGPL to be
 * GPL-compatible. */
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,46 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#ifndef TASK_COMM_LEN
#define TASK_COMM_LEN 16
#endif
/* It would be nice to size these members to bigger values, but the stack
* in BPF programs is limited to 512 bytes, and allocating bigger structures
* leads to this compile time error:
* error: Looks like the BPF stack limit is exceeded.
* Please move large on stack variables into BPF per-cpu array map.
* For non-kernel uses, the stack can be increased using -mllvm -bpf-stack-size. */
/* Record describing one sysctl write. It is filled in by the BPF program
 * (sysctl_monitor()) and pushed through the ring buffer to networkd, where
 * sysctl_event_handler() consumes it. Both sides must agree on this layout. */
struct sysctl_write_event {
/* Used to track changes in the struct layout. The producer sets it to 1
 * and the consumer rejects any other value. */
int version;
/* Error code returned to userspace to handle eventual failures.
 * 0 on success, otherwise the negative return value of the BPF helper
 * that failed. */
int errorcode;
/* The PID of the process which is writing the sysctl. */
pid_t pid;
/* The cgroup id of the process. */
uint64_t cgroup_id;
/* The name of the binary. */
char comm[TASK_COMM_LEN];
/* The path of the sysctl, relative to /proc/sys/.
* The longest path observed is 64 bytes:
* net/ipv4/conf/123456789012345/igmpv3_unsolicited_report_interval
* so setting it to 100 gives us a lot of headroom */
char path[100];
/* The value of the sysctl just before the write.
* The longest value observed is net.core.netdev_rss_key which
* contains 155 bytes, so set it to 160 to have some headroom
* even in this corner case. */
char current[160];
/* The new value being written into the sysctl.
* Same sizing as 'current'. */
char newvalue[160];
};

View File

@ -1,5 +1,7 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
subdir('bpf/sysctl_monitor')
sources = files(
'netdev/bareudp.c',
'netdev/batadv.c',
@ -140,6 +142,10 @@ network_generator_sources = files(
networkd_network_gperf_gperf = files('networkd-network-gperf.gperf')
networkd_netdev_gperf_gperf = files('netdev/netdev-gperf.gperf')
if conf.get('HAVE_VMLINUX_H') == 1
sources += sysctl_monitor_skel_h
endif
sources += custom_target(
'networkd-gperf.c',
input : 'networkd-gperf.gperf',

View File

@ -252,6 +252,8 @@ static void link_free_engines(Link *link) {
static Link *link_free(Link *link) {
assert(link);
(void) sysctl_clear_link_shadows(link);
link_ntp_settings_clear(link);
link_dns_settings_clear(link);

View File

@ -16,6 +16,7 @@
#include "bus-log-control-api.h"
#include "bus-polkit.h"
#include "bus-util.h"
#include "capability-util.h"
#include "common-signal.h"
#include "conf-parser.h"
#include "constants.h"
@ -603,6 +604,7 @@ int manager_new(Manager **ret, bool test_mode) {
.duid_product_uuid.type = DUID_TYPE_UUID,
.dhcp_server_persist_leases = true,
.ip_forwarding = { -1, -1, },
.cgroup_fd = -EBADF,
};
*ret = TAKE_PTR(m);
@ -615,6 +617,8 @@ Manager* manager_free(Manager *m) {
if (!m)
return NULL;
sysctl_remove_monitor(m);
free(m->state_file);
HASHMAP_FOREACH(link, m->links_by_index)
@ -694,6 +698,18 @@ int manager_start(Manager *m) {
assert(m);
(void) sysctl_add_monitor(m);
/* Loading BPF programs requires CAP_SYS_ADMIN and CAP_BPF.
* Drop the capabilities here, regardless if the load succeeds or not. */
r = drop_capability(CAP_SYS_ADMIN);
if (r < 0)
log_warning_errno(r, "Failed to drop CAP_SYS_ADMIN: %m, ignoring.");
r = drop_capability(CAP_BPF);
if (r < 0)
log_warning_errno(r, "Failed to drop CAP_BPF: %m, ignoring.");
manager_set_sysctl(m);
r = manager_request_static_address_labels(m);

View File

@ -123,6 +123,11 @@ struct Manager {
/* sysctl */
int ip_forwarding[2];
Hashmap *sysctl_shadow;
sd_event_source *sysctl_event_source;
struct ring_buffer *sysctl_buffer;
struct sysctl_monitor_bpf *sysctl_skel;
struct bpf_link *sysctl_link;
int cgroup_fd;
};
int manager_new(Manager **ret, bool test_mode);

View File

@ -4,7 +4,11 @@
#include <linux/if.h>
#include <linux/if_arp.h>
#include "sd-messages.h"
#include "af-list.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "missing_network.h"
#include "networkd-link.h"
#include "networkd-lldp-tx.h"
@ -12,10 +16,197 @@
#include "networkd-ndisc.h"
#include "networkd-network.h"
#include "networkd-sysctl.h"
#include "path-util.h"
#include "socket-util.h"
#include "string-table.h"
#include "sysctl-util.h"
#if HAVE_VMLINUX_H
#include "bpf-link.h"
#include "bpf/sysctl_monitor/sysctl-monitor-skel.h"
#include "bpf/sysctl_monitor/sysctl-write-event.h"
/* Destroys the BPF skeleton 'obj' and returns NULL, so that the result can
 * be assigned back to the pointer that held it. */
static struct sysctl_monitor_bpf *sysctl_monitor_bpf_free(struct sysctl_monitor_bpf *obj) {
sysctl_monitor_bpf__destroy(obj);
return NULL;
}
/* Frees the libbpf ring buffer 'rb' through the dlopen()ed symbol and
 * returns NULL, so that the result can be assigned back to the pointer that
 * held it. */
static struct ring_buffer *rb_free(struct ring_buffer *rb) {
sym_ring_buffer__free(rb);
return NULL;
}
/* Generate the sysctl_monitor_bpf_freep() and rb_freep() helpers that
 * sysctl_add_monitor() uses with _cleanup_(). */
DEFINE_TRIVIAL_CLEANUP_FUNC(struct sysctl_monitor_bpf *, sysctl_monitor_bpf_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(struct ring_buffer *, rb_free);
static int sysctl_event_handler(void *ctx, void *data, size_t data_sz) {
struct sysctl_write_event *we = ASSERT_PTR(data);
Hashmap **sysctl_shadow = ASSERT_PTR(ctx);
_cleanup_free_ char *path = NULL;
char *value;
/* Returning a negative value interrupts the ring buffer polling,
* so do it only in case of a fatal error like a version mismatch. */
if (we->version != 1)
return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
"Unexpected sysctl event, disabling sysctl monitoring: %d", we->version);
if (we->errorcode != 0) {
log_warning_errno(we->errorcode, "Sysctl monitor BPF returned error: %m");
return 0;
}
path = path_join("/proc/sys", we->path);
if (!path) {
log_oom();
return 0;
}
/* If we never managed this handle, ignore it. */
value = hashmap_get(*sysctl_shadow, path);
if (!value)
return 0;
if (!strneq(value, we->newvalue, sizeof(we->newvalue)))
log_struct(LOG_WARNING,
"MESSAGE_ID=" SD_MESSAGE_SYSCTL_CHANGED_STR,
"OBJECT_PID=%d", we->pid,
"OBJECT_COMM=%s", we->comm,
"SYSCTL=/proc/sys/%s", we->path,
"OLDVALUE=%s", we->current,
"NEWVALUE=%s", we->newvalue,
"OURVALUE=%s", value,
LOG_MESSAGE("Foreign process '%s[%d]' changed sysctl '/proc/sys/%s' from '%s' to '%s', conflicting with our setting to '%s'",
we->comm, we->pid, we->path, we->current, we->newvalue, value));
return 0;
}
/* sd-event I/O callback for the ring buffer's epoll fd: consumes the pending
 * samples without blocking. 'userdata' is the struct ring_buffer to poll.
 * Polling errors other than EINTR are logged; the callback always returns 0. */
static int on_ringbuf_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        struct ring_buffer *rb = ASSERT_PTR(userdata);

        if (sym_ring_buffer__poll(rb, /* timeout_msec= */ 0) >= 0)
                return 0;

        /* An interrupted poll is not worth reporting. */
        if (errno == EINTR)
                return 0;

        log_error_errno(errno, "Error polling ring buffer: %m");
        return 0;
}
/* Loads the sysctl monitor BPF program, attaches it to the root cgroup and
 * hooks its ring buffer into the manager's event loop.
 *
 * Returns 0 on success, and also when monitoring is skipped because BPF is
 * unavailable or the program cannot be loaded or attached (logged at info
 * level). Other failures are logged as warnings and returned as a negative
 * errno-style value. All resources are held in _cleanup_ variables and only
 * handed over to 'manager' once every step has succeeded. */
int sysctl_add_monitor(Manager *manager) {
_cleanup_(sysctl_monitor_bpf_freep) struct sysctl_monitor_bpf *obj = NULL;
_cleanup_(bpf_link_freep) struct bpf_link *sysctl_link = NULL;
_cleanup_(rb_freep) struct ring_buffer *sysctl_buffer = NULL;
_cleanup_close_ int cgroup_fd = -EBADF, rootcg = -EBADF;
_cleanup_free_ char *cgroup = NULL;
int idx = 0, r;
assert(manager);
/* libbpf is used via dlopen(); without it there is nothing to set up. */
r = dlopen_bpf();
if (r < 0) {
log_info_errno(r, "sysctl monitor disabled, as BPF support is not available.");
return 0;
}
/* Path of the cgroup whose writes the BPF program must ignore (PID 0 is passed as the process). */
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
if (r < 0)
return log_warning_errno(r, "Failed to get cgroup path, ignoring: %m.");
/* The program is attached to the root cgroup further down. */
rootcg = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, "/");
if (rootcg < 0)
return log_warning_errno(rootcg, "Failed to open cgroup, ignoring: %m.");
obj = sysctl_monitor_bpf__open_and_load();
if (!obj) {
log_info_errno(errno, "Unable to load sysctl monitor BPF program, ignoring: %m.");
return 0;
}
cgroup_fd = cg_path_open(SYSTEMD_CGROUP_CONTROLLER, cgroup);
if (cgroup_fd < 0)
return log_warning_errno(cgroup_fd, "Failed to open cgroup: %m");
/* Slot 0 of cgroup_map is what the BPF program checks to skip our own writes. */
if (sym_bpf_map_update_elem(sym_bpf_map__fd(obj->maps.cgroup_map), &idx, &cgroup_fd, BPF_ANY))
return log_warning_errno(errno, "Failed to update cgroup map: %m");
sysctl_link = sym_bpf_program__attach_cgroup(obj->progs.sysctl_monitor, rootcg);
r = bpf_get_error_translated(sysctl_link);
if (r < 0) {
log_info_errno(r, "Unable to attach sysctl monitor BPF program to cgroup, ignoring: %m.");
return 0;
}
/* Samples are delivered to sysctl_event_handler() with the shadow hashmap as context. */
sysctl_buffer = sym_ring_buffer__new(
sym_bpf_map__fd(obj->maps.written_sysctls),
sysctl_event_handler, &manager->sysctl_shadow, NULL);
if (!sysctl_buffer)
return log_warning_errno(errno, "Failed to create ring buffer: %m");
/* Let the event loop wake us up when the ring buffer has data; the ring buffer is the callback's userdata. */
r = sd_event_add_io(manager->event, &manager->sysctl_event_source,
sym_ring_buffer__epoll_fd(sysctl_buffer), EPOLLIN, on_ringbuf_io, sysctl_buffer);
if (r < 0)
return log_warning_errno(r, "Failed to watch sysctl event ringbuffer: %m");
/* Everything succeeded: transfer ownership to the manager so the cleanup handlers do nothing. */
manager->sysctl_link = TAKE_PTR(sysctl_link);
manager->sysctl_skel = TAKE_PTR(obj);
manager->sysctl_buffer = TAKE_PTR(sysctl_buffer);
manager->cgroup_fd = TAKE_FD(cgroup_fd);
return 0;
}
/* Tears down everything sysctl_add_monitor() handed over to 'manager': the
 * event source, the ring buffer, the BPF link, the skeleton and the cgroup
 * fd. Each libbpf object is released only if it is set, and its pointer is
 * reset to NULL. */
void sysctl_remove_monitor(Manager *manager) {
        assert(manager);

        manager->sysctl_event_source = sd_event_source_disable_unref(manager->sysctl_event_source);

        if (manager->sysctl_buffer)
                sym_ring_buffer__free(TAKE_PTR(manager->sysctl_buffer));

        if (manager->sysctl_link)
                sym_bpf_link__destroy(TAKE_PTR(manager->sysctl_link));

        if (manager->sysctl_skel)
                sysctl_monitor_bpf__destroy(TAKE_PTR(manager->sysctl_skel));

        manager->cgroup_fd = safe_close(manager->cgroup_fd);
}
/* Drops from the manager's sysctl shadow map every entry that lives below
 * /proc/sys/net/ipv4/conf/<ifname> or /proc/sys/net/ipv6/conf/<ifname> for
 * the given link, freeing both the stored key and value.
 * Returns 0 on success, or the result of log_oom() if a prefix cannot be
 * allocated. */
int sysctl_clear_link_shadows(Link *link) {
        _cleanup_free_ char *v4_prefix = NULL, *v6_prefix = NULL;
        char *shadow_key = NULL, *shadow_value = NULL;
        Hashmap *shadows;

        assert(link);
        assert(link->manager);

        v4_prefix = path_join("/proc/sys/net/ipv4/conf", link->ifname);
        if (!v4_prefix)
                return log_oom();

        v6_prefix = path_join("/proc/sys/net/ipv6/conf", link->ifname);
        if (!v6_prefix)
                return log_oom();

        shadows = link->manager->sysctl_shadow;

        HASHMAP_FOREACH_KEY(shadow_value, shadow_key, shadows) {
                /* Leave entries of other interfaces alone. */
                if (!path_startswith(shadow_key, v4_prefix) && !path_startswith(shadow_key, v6_prefix))
                        continue;

                assert_se(hashmap_remove_value(shadows, shadow_key, shadow_value) == shadow_value);
                free(shadow_key);
                free(shadow_value);
        }

        return 0;
}
#endif
static void manager_set_ip_forwarding(Manager *manager, int family) {
int r, t;

View File

@ -27,6 +27,16 @@ typedef enum IPReversePathFilter {
_IP_REVERSE_PATH_FILTER_INVALID = -EINVAL,
} IPReversePathFilter;
/* The sysctl monitor is only built when HAVE_VMLINUX_H is set. Otherwise the
 * entry points are replaced by inline stubs that do nothing and report
 * success, so that callers need no conditionals. */
#if HAVE_VMLINUX_H
int sysctl_add_monitor(Manager *manager);
void sysctl_remove_monitor(Manager *manager);
int sysctl_clear_link_shadows(Link *link);
#else
static inline int sysctl_add_monitor(Manager *manager) { return 0; }
static inline void sysctl_remove_monitor(Manager *manager) { }
static inline int sysctl_clear_link_shadows(Link *link) { return 0; }
#endif
void manager_set_sysctl(Manager *manager);
int link_get_ip_forwarding(Link *link, int family);

View File

@ -62,7 +62,9 @@ static int run(int argc, char *argv[]) {
(1ULL << CAP_NET_ADMIN) |
(1ULL << CAP_NET_BIND_SERVICE) |
(1ULL << CAP_NET_BROADCAST) |
(1ULL << CAP_NET_RAW));
(1ULL << CAP_NET_RAW) |
(1ULL << CAP_SYS_ADMIN) |
(1ULL << CAP_BPF));
if (r < 0)
return log_error_errno(r, "Failed to drop privileges: %m");
}

View File

@ -277,6 +277,9 @@ _SD_BEGIN_DECLARATIONS;
#define SD_MESSAGE_SRK_ENROLLMENT_NEEDS_AUTHORIZATION SD_ID128_MAKE(ad,70,89,f9,28,ac,4f,7e,a0,0c,07,45,7d,47,ba,8a)
#define SD_MESSAGE_SRK_ENROLLMENT_NEEDS_AUTHORIZATION_STR SD_ID128_MAKE_STR(ad,70,89,f9,28,ac,4f,7e,a0,0c,07,45,7d,47,ba,8a)
#define SD_MESSAGE_SYSCTL_CHANGED SD_ID128_MAKE(9c,f5,6b,8b,af,95,46,cf,94,78,78,3a,8d,e4,21,13)
#define SD_MESSAGE_SYSCTL_CHANGED_STR SD_ID128_MAKE_STR(9c,f5,6b,8b,af,95,46,cf,94,78,78,3a,8d,e4,21,13)
_SD_END_DECLARATIONS;
#endif

View File

@ -20,9 +20,9 @@ Conflicts=shutdown.target initrd-switch-root.target
Wants=systemd-networkd.socket network.target systemd-networkd-persistent-storage.service
[Service]
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW CAP_BPF CAP_SYS_ADMIN
BusName=org.freedesktop.network1
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_RAW CAP_BPF CAP_SYS_ADMIN
DeviceAllow=char-* rw
ExecStart=!!{{LIBEXECDIR}}/systemd-networkd
FileDescriptorStoreMax=512
@ -48,7 +48,7 @@ RuntimeDirectory=systemd/netif
RuntimeDirectoryPreserve=yes
SystemCallArchitectures=native
SystemCallErrorNumber=EPERM
SystemCallFilter=@system-service
SystemCallFilter=@system-service bpf
Type=notify-reload
User=systemd-network
{{SERVICE_WATCHDOG}}