mirror of
https://github.com/systemd/systemd.git
synced 2024-11-26 19:53:45 +08:00
8aee931e7a
This adds a small, socket-activated Varlink daemon that can delegate UID ranges for user namespaces to clients asking for it. The primary call is AllocateUserRange() where the user passes in an uninitialized userns fd, which is then set up. There are other calls that allow assigning a mount fd to a userns allocated that way, to set up permissions for a cgroup subtree, and to allocate a veth for such a user namespace. Since the UID assignments are supposed to be transient, i.e. not permanent, care is taken to ensure that users cannot create inodes owned by these UIDs, so that persistency cannot be acquired. This is implemented via a BPF-LSM module that ensures that any member of a userns allocated that way cannot create files unless the mount it operates on is owned by the userns itself, or is explicitly allowlisted. BPF LSM program with contributions from Alexei Starovoitov.
48 lines
1.4 KiB
SYSTEMD
48 lines
1.4 KiB
SYSTEMD
# SPDX-License-Identifier: LGPL-2.1-or-later
|
|
#
|
|
# This file is part of systemd.
|
|
#
|
|
# systemd is free software; you can redistribute it and/or modify it
|
|
# under the terms of the GNU Lesser General Public License as published by
|
|
# the Free Software Foundation; either version 2.1 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# Unit metadata, dependencies and ordering for the namespace resource manager.
[Unit]
|
|
Description=Namespace Resource Manager
|
|
Documentation=man:systemd-nsresourced.service(8)
|
|
# The service requires its socket unit and is ordered after it.
Requires=systemd-nsresourced.socket
|
|
After=systemd-nsresourced.socket
|
|
# Conflicts with shutdown.target and is ordered before both sysinit.target and
# shutdown.target.
Conflicts=shutdown.target
|
|
Before=sysinit.target shutdown.target
|
|
# Default dependencies are disabled; the ordering and conflict relations above
# are spelled out explicitly instead.
DefaultDependencies=no
|
|
|
|
# Execution environment for the systemd-nsresourced daemon: the binary to run,
# its capability bounding set, resource limits, sandboxing options and the
# system call filter. See systemd.exec(5) and systemd.service(5) for the exact
# semantics of each setting.
[Service]
|
|
# Capability bounding set for the daemon.
# NOTE(review): CAP_BPF/CAP_PERFMON are presumably needed for the BPF-LSM
# program the daemon uses - confirm against the daemon sources.
CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER
|
|
# {{LIBEXECDIR}} is a template placeholder filled in when the unit file is
# generated.
ExecStart={{LIBEXECDIR}}/systemd-nsresourced
|
|
IPAddressDeny=any
|
|
# {{HIGH_RLIMIT_NOFILE}} is a template placeholder filled in when the unit file
# is generated.
LimitNOFILE={{HIGH_RLIMIT_NOFILE}}
|
|
LockPersonality=yes
|
|
MemoryDenyWriteExecute=yes
|
|
NoNewPrivileges=yes
|
|
PrivateDevices=yes
|
|
ProtectProc=invisible
|
|
ProtectControlGroups=yes
|
|
ProtectHome=yes
|
|
ProtectHostname=yes
|
|
ProtectKernelLogs=yes
|
|
ProtectKernelModules=yes
|
|
ProtectSystem=strict
|
|
# Only AF_UNIX and AF_NETLINK sockets are listed as permitted address families.
RestrictAddressFamilies=AF_UNIX AF_NETLINK
|
|
RestrictRealtime=yes
|
|
RestrictSUIDSGID=yes
|
|
SystemCallArchitectures=native
|
|
# System calls rejected by the filter below fail with EPERM.
SystemCallErrorNumber=EPERM
|
|
# In addition to the @system-service group, bpf, perf_event_open and
# open_by_handle_at are explicitly listed.
SystemCallFilter=@system-service bpf perf_event_open open_by_handle_at
|
|
# The daemon signals readiness via the notification protocol; notifications are
# accepted from all processes of the service.
Type=notify
|
|
NotifyAccess=all
|
|
# Up to 4096 file descriptors may be kept in the service manager's fd store.
FileDescriptorStoreMax=4096
|
|
# Template placeholder filled in when the unit file is generated.
# NOTE(review): presumably expands to the watchdog setting - confirm against
# the build templates.
{{SERVICE_WATCHDOG}}
|
|
|
|
# Installation information: enabling or disabling this unit also applies to the
# socket unit named in Also=.
[Install]
|
|
Also=systemd-nsresourced.socket
|