You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
241 lines
8.9 KiB
241 lines
8.9 KiB
/* |
|
* Small Linux Namespace utility header |
|
* (c) 2019 Taeyeon Mori <taeyeon AT oro.sodimm.me> |
|
*/ |
|
|
|
#pragma once |
|
|
|
#include "koutil.hpp" |
|
#include "kofd.hpp" |
|
#include "koos.hpp" |
|
|
|
#include <sched.h> |
|
#include <sys/syscall.h> |
|
#include <sys/types.h> |
|
#include <unistd.h> |
|
|
|
#include <array> |
|
#include <cstdlib> |
|
#include <fstream> |
|
#include <iostream> |
|
#include <memory> |
|
|
|
|
|
namespace ko::ns { |
|
|
|
/** |
|
* idmap namespace |
|
* Contains functions for setting /proc/pid/[ug]id_map |
|
*/ |
|
namespace idmap { |
|
/// One line of a /proc/<pid>/[ug]id_map file.
/// set() writes the three fields in declaration order, separated by spaces.
struct entry {
    /// First field of the map line (first ID of the range as seen inside the namespace; cf. single())
    uid_t start;
    /// Second field of the map line (first ID of the range on the host side; cf. single())
    uid_t host_start;
    /// Third field of the map line: number of consecutive IDs covered by this entry
    unsigned long count;
};
|
|
|
/// Write the idmap <map> to <path> |
|
template <typename Container> |
|
inline bool set(fs::path path, Container map) { |
|
auto stream = std::ofstream(path); |
|
for (entry &e : map) |
|
stream << e.start << ' ' << e.host_start << ' ' << e.count << '\n'; |
|
stream.close(); |
|
return stream.good(); |
|
} |
|
|
|
/// Disable setgroups() syscall for process <pid> |
|
/// This is required for unprivileged user namespaces |
|
bool disable_setgroups(pid_t pid) { |
|
auto stream = std::ofstream(util::str("/proc/", pid, "/setgroups")); |
|
stream << "deny"; |
|
stream.close(); |
|
return stream.good(); |
|
} |
|
|
|
/// Return an idmap mapping a single ID |
|
inline constexpr std::array<entry, 1> single(uid_t id, uid_t host_id) { |
|
return {{ {id, host_id, 1} }}; |
|
} |
|
|
|
/// Get the path to the <map_type> map for <pid> |
|
/// <map_type> may be "uid" or "pid" |
|
inline fs::path path(pid_t pid, const char *map_type) { |
|
return util::str("/proc/", pid, "/", map_type, "_map"); |
|
} |
|
} |
|
|
|
|
|
/** |
|
* namespace ko::ns::mount |
|
* Stuff related to setting up the mount namespace |
|
*/ |
|
namespace mount { |
|
using os::mount; |
|
using os::bind; |
|
|
|
// Mount all the basic filesystems |
|
util::cvresult mount_core(const fs::path &root) { |
|
return util::cvshort() |
|
.ifthen("mount_root", !os::is_mountpoint(root), bind, root, root, 0) |
|
.ifthen("mount_proc", fs::exists(root / "proc"), mount, "proc", root / "proc", "proc", 0, nullptr) |
|
.ifthen("mount_sys", fs::exists(root / "sys"), bind, "/sys", root / "sys", MS_REC) |
|
.ifthen("mount_dev", fs::exists(root / "dev"), bind, "/dev", root / "dev", MS_REC) |
|
.ifthen("mount_tmp", fs::exists(root / "tmp"), mount, "tmp", root / "tmp", "tmpfs", 0, nullptr) |
|
.ifthen("mount_run", fs::exists(root / "run"), mount, "run", root / "run", "tmpfs", 0, nullptr); |
|
} |
|
|
|
/// Write-protect path to mitigate broken filesystem permissions in single-user ns |
|
util::cvresult protect_path(const fs::path &path) { |
|
return util::cvshort() |
|
.then("bind_protect", bind, path, path, MS_REC) |
|
.then("bind_protect_ro", bind, path, path, MS_REC|MS_REMOUNT|MS_RDONLY); |
|
} |
|
|
|
/// Bind in additional locations required by GUI programs |
|
/// Some of these are serious isolation breaches! |
|
/// Note that home and rundir must be relative and will be interpreted against both '/' and $root |
|
util::cvresult mount_gui(const fs::path &root, const fs::path &home, const fs::path &rundir) { |
|
auto path_from_env = [](const char *name, fs::path dflt, const char *prefix=nullptr) -> fs::path { |
|
auto var = getenv(name); |
|
if (var != nullptr) { |
|
if (prefix != nullptr) { |
|
auto plen = strlen(prefix); |
|
if (!strncmp(var, prefix, plen)) |
|
var += plen; |
|
} |
|
if (var[0] == '/') |
|
return var+1; |
|
} |
|
return dflt; |
|
}; |
|
auto path_from_env_rel = [](fs::path to, const char *name, const char *dflt) -> fs::path { |
|
auto var = getenv(name); |
|
if (var != nullptr) |
|
return to / var; |
|
return to / dflt; |
|
}; |
|
// Bind-mount various paths required to get GUI apps to communicate with system services |
|
// X11, DBus (both buses, Steam does use the system bus), PulseAudio |
|
auto frags = std::array<fs::path, 9>{ |
|
"tmp/.X11-unix", |
|
"run/dbus", |
|
"run/udev", // Udev database for correct ENV entries e.g. ID_INPUT_JOYSTICK markers |
|
//"etc/machine-id", // Pulseaudio will only connect with same machine-id by default. See below |
|
path_from_env("XAUTHORITY", home / ".Xauthority"), |
|
home / ".config/pulse/cookie", |
|
path_from_env("DBUS_SESSION_BUS_ADDRESS", rundir / "bus", "unix:path="), |
|
rundir / "pulse", |
|
rundir / "pipewire-0", |
|
path_from_env_rel(rundir, "WAYLAND_DISPLAY", "wayland-0"), |
|
}; |
|
|
|
// /tmp/.X11-unix must be owned by user or root for wlroots xwayland to work (e.g. gamescope) |
|
// behaviour can be overridden by env var KONS_BIND_X11=all |
|
if (![&frags, root]() { |
|
auto x11_mount_mode = getenv("KONS_BIND_X11"); |
|
if (x11_mount_mode && !strcasecmp("all", x11_mount_mode)) |
|
return true; |
|
auto display = getenv("DISPLAY"); |
|
if (!display) |
|
return false; |
|
if (display[0] == ':') |
|
display += 1; |
|
for (char *c = display; *c; c++) |
|
if (!isdigit(*c)) |
|
return false; |
|
auto dirname = root / frags[0]; |
|
fs::create_directories(dirname); |
|
::chmod(dirname.c_str(), 01777); |
|
auto sockname = frags[0] / util::str("X", display); |
|
fd::touch(root / sockname); |
|
frags[0] = sockname; |
|
return true; |
|
}()) { |
|
std::cerr << "Warn: Invalid $DISPLAY value; falling back to bind-mounting /tmp/.X11-unix whole" << std::endl; |
|
} |
|
|
|
// Pulseaudio will by default only connect to the server published in the X11 root window properties if the machine-ids match. |
|
// Either we bind-mount /etc/machine-id or we need to set PULSE_SERVER in the environment. Both are suboptimal hacks: |
|
// /etc/machine-id shoudn't be the same across two rootfs' but it might be acceptable since we're not running init. |
|
// OTOH, setting PULSE_SERVER can break with nonstandard configurations if they're not manually set in ENV. X11 publish is not enough. |
|
auto pulse = util::str("unix:/", rundir.c_str(), "/pulse/native"); |
|
setenv("PULSE_SERVER", pulse.c_str(), 0); // Don't overwrite, assume that there's a reason it's set. May be TCP. |
|
// If custom unix socket path, it could fail either way as it may not be included above. |
|
// NOTE that exec[vlp]e() must be used to make setenv() work. |
|
|
|
auto sh = util::cvshort(); |
|
auto host_root = fs::path("/"); |
|
for (auto frag : frags) { |
|
auto hpath = host_root / frag; |
|
|
|
if (fs::exists(hpath)) { |
|
auto path = root / frag; |
|
|
|
if (!fs::exists(path)) { |
|
if (fs::is_directory(hpath)) |
|
fs::create_directories(path); |
|
else { |
|
fs::create_directories(path.parent_path()); |
|
auto touch = std::ofstream(path); |
|
} |
|
} |
|
|
|
if (!(sh = sh.then("mount_gui", bind, hpath, path, 0))) |
|
break; |
|
} |
|
} |
|
|
|
return sh; |
|
} |
|
|
|
/// Pivot the root to $new_root, optionally keeping the old one at $old_root. |
|
/// Note that the $old_root directory is required in the process either way. |
|
util::cvresult pivot_root(const fs::path &new_root, const fs::path &old_root, bool keep_old=true) { |
|
auto path = new_root / old_root; |
|
if (!fs::exists(path)) |
|
fs::create_directories(path); |
|
|
|
return util::cvshort() |
|
.then("pivot_root", syscall, SYS_pivot_root, new_root.c_str(), path.c_str()) |
|
.then("chdir_root", chdir, "/") |
|
.ifthen("umount_oldroot", !keep_old, umount2, old_root.c_str(), MNT_DETACH); |
|
} |
|
} // namespace mount |
|
|
|
|
|
/** |
|
* Unshare (at least) new single-user namespace |
|
* @param uid The uid inside the userns |
|
* @param gid The gid inside the userns |
|
* @param flags The unshare(2)/clone(2) flags (CLONE_NEWUSER implied) |
|
* @return Zero on success, -1 + errno on failure |
|
*/ |
|
inline int unshare_single(uid_t uid, uid_t gid, long flags) { |
|
auto euid = geteuid(); |
|
auto egid = getegid(); |
|
auto r = ::unshare(flags | CLONE_NEWUSER); |
|
if (r != 0) |
|
return r; |
|
if (!idmap::set("/proc/self/uid_map", idmap::single(uid, euid))) |
|
return -1; |
|
if (!idmap::disable_setgroups(getpid())) |
|
return -1; |
|
if (!idmap::set("/proc/self/gid_map", idmap::single(gid, egid))) |
|
return -1; |
|
return 0; |
|
} |
|
|
|
|
|
/**
 * Join the namespace referred to by the file at <path>
 * @param path Path of the namespace file, opened read-only relative to <dirfd>
 * @param type Namespace type, passed through to setns(2)
 * @param dirfd Directory file descriptor for openat(2); defaults to AT_FDCWD
 * @return Zero on success, -1 + errno on failure (from either openat(2) or setns(2))
 */
inline int setns(const fs::cpath &path, int type, int dirfd=AT_FDCWD) {
    // O_CLOEXEC: the descriptor is only needed for the duration of this call
    const int fd = ::openat(dirfd, path, O_RDONLY | O_CLOEXEC);
    if (fd < 0)
        return -1; // errno was set by openat(); same convention as a failing ::setns()
    const int res = ::setns(fd, type);
    // Don't let close() overwrite the errno reported by ::setns()
    const int saved_errno = errno;
    ::close(fd);
    errno = saved_errno;
    return res;
}
|
|
|
} // namespace ko::ns |
|
|
|
|