M gnu/build/linux-container.scm => gnu/build/linux-container.scm +87 -94
@@ 263,100 263,93 @@ that host UIDs (respectively GIDs) map to in the namespace."
;; child process blocks until the parent writes to it.
(match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0)
((child . parent)
- (let ((flags (namespaces->bit-mask namespaces)))
- (match (clone flags)
- (0
- ;; Inhibit thread creation until after the unshare call.
- (gc-disable)
- (call-with-clean-exit
- (lambda ()
- (close-port parent)
- ;; Wait for parent to set things up.
- (match (read child)
- ('ready
- (purify-environment)
- (when (and (memq 'mnt namespaces)
- (not (string=? root "/")))
- (catch #t
- (lambda ()
- (mount-file-systems root mounts
- #:mount-/proc? (memq 'pid namespaces)
- #:mount-/sys? (memq 'net
- namespaces)
- #:populate-file-system
- (lambda ()
- (populate-file-system)
- (when (and (memq 'net namespaces)
- loopback-network?)
- (set-network-interface-up "lo")
-
- ;; When isolated from the
- ;; network, provide a minimal
- ;; /etc/hosts to resolve
- ;; "localhost".
- (mkdir-p "/etc")
- (call-with-output-file "/etc/hosts"
- (lambda (port)
- (display "127.0.0.1 localhost\n" port)
- (chmod port #o444)))))
- #:writable-root?
- (or writable-root?
- (not (memq 'mnt namespaces)))))
- (lambda args
- ;; Forward the exception to the parent process.
- ;; FIXME: SRFI-35 conditions and non-trivial objects
- ;; cannot be 'read' so they shouldn't be written as is.
- (write args child)
- (primitive-exit 3))))
-
- (when (and lock-mounts?
- (memq 'mnt namespaces)
- (memq 'user namespaces))
- ;; Create a new mount namespace owned by a new user
- ;; namespace to "lock" together previous mounts, such that
- ;; they cannot be unmounted or remounted separately--see
- ;; mount_namespaces(7).
- ;;
- ;; Note: at this point, the process is single-threaded (no
- ;; GC mark threads, no finalization thread, etc.) which is
- ;; why unshare(CLONE_NEWUSER) can be used.
- (let ((uid (getuid)) (gid (getgid)))
- (unshare (logior CLONE_NEWUSER CLONE_NEWNS))
- (gc-enable)
- (when (file-exists? "/proc/self")
- (initialize-user-namespace (getpid)
- host-uids
- #:host-uid uid
- #:host-gid gid
- #:guest-uid guest-uid
- #:guest-gid guest-gid))))
-
- ;; TODO: Manage capabilities.
- (write 'ready child)
- (close-port child)
- (thunk))
- (_ ;parent died or something
- (primitive-exit 2))))))
- (pid
- (close-port child)
- (when (memq 'user namespaces)
- (initialize-user-namespace pid host-uids
- #:guest-uid guest-uid
- #:guest-gid guest-gid))
- ;; TODO: Initialize cgroups.
- (write 'ready parent)
- (newline parent)
-
- ;; Check whether the child process' setup phase succeeded.
- (let ((message (read parent)))
- (close-port parent)
- (match message
- ('ready ;success
- pid)
- (((? symbol? key) args ...) ;exception
- (apply throw key args))
- (_ ;unexpected termination
- #f)))))))))
+ (safe-clone
+ (namespaces->bit-mask namespaces)
+ (lambda ()
+ (call-with-clean-exit
+ (lambda ()
+ (close-port parent)
+ ;; Wait for parent to set things up.
+ (match (read child)
+ ('ready
+ (purify-environment)
+ (when (and (memq 'mnt namespaces)
+ (not (string=? root "/")))
+ (catch #t
+ (lambda ()
+ (mount-file-systems root mounts
+ #:mount-/proc? (memq 'pid namespaces)
+ #:mount-/sys? (memq 'net
+ namespaces)
+ #:populate-file-system
+ (lambda ()
+ (populate-file-system)
+ (when (and (memq 'net namespaces)
+ loopback-network?)
+ (set-network-interface-up "lo")
+
+ ;; When isolated from the
+ ;; network, provide a minimal
+ ;; /etc/hosts to resolve
+ ;; "localhost".
+ (mkdir-p "/etc")
+ (call-with-output-file "/etc/hosts"
+ (lambda (port)
+ (display "127.0.0.1 localhost\n" port)
+ (chmod port #o444)))))
+ #:writable-root?
+ (or writable-root?
+ (not (memq 'mnt namespaces)))))
+ (lambda args
+ ;; Forward the exception to the parent process.
+ ;; FIXME: SRFI-35 conditions and non-trivial objects
+ ;; cannot be 'read' so they shouldn't be written as is.
+ (write args child)
+ (primitive-exit 3))))
+
+ (when (and lock-mounts?
+ (memq 'mnt namespaces)
+ (memq 'user namespaces))
+ ;; Create a new mount namespace owned by a new user
+ ;; namespace to "lock" together previous mounts, such that
+ ;; they cannot be unmounted or remounted separately--see
+ ;; mount_namespaces(7).
+ (let ((uid (getuid)) (gid (getgid)))
+ (unshare (logior CLONE_NEWUSER CLONE_NEWNS))
+ (when (file-exists? "/proc/self")
+ (initialize-user-namespace (getpid)
+ host-uids
+ #:host-uid uid
+ #:host-gid gid
+ #:guest-uid guest-uid
+ #:guest-gid guest-gid))))
+
+ ;; TODO: Manage capabilities.
+ (write 'ready child)
+ (close-port child)
+ (thunk))
+ (_ ;parent died or something
+ (primitive-exit 2))))))
+ (lambda (pid)
+ (close-port child)
+ (when (memq 'user namespaces)
+ (initialize-user-namespace pid host-uids
+ #:guest-uid guest-uid
+ #:guest-gid guest-gid))
+ ;; TODO: Initialize cgroups.
+ (write 'ready parent)
+ (newline parent)
+
+ ;; Check whether the child process' setup phase succeeded.
+ (let ((message (read parent)))
+ (close-port parent)
+ (match message
+ ('ready ;success
+ pid)
+ (((? symbol? key) args ...) ;exception
+ (apply throw key args))
+ (_ ;unexpected termination
+ #f))))))))
;; FIXME: This is copied from (guix utils), which we cannot use because it
;; would pull (guix config) and all.
M guix/build/syscalls.scm => guix/build/syscalls.scm +50 -3
@@ 150,6 150,7 @@
CLONE_THREAD
CLONE_VM
clone
+ safe-clone
unshare
setns
get-user-ns
@@ 1170,17 1171,45 @@ caller lacks root privileges."
Turning finalization off shuts down the finalization thread as a side effect."
(->bool ((force proc) (if enabled? 1 0))))))
-(define-syntax-rule (without-automatic-finalization exp)
- "Turn off automatic finalization within the dynamic extent of EXP."
+(define-syntax-rule (without-automatic-finalization body ...)
+ "Turn off automatic finalization within the dynamic extent of BODY. This is
+useful to ensure there is no finalization thread."
(let ((enabled? #t))
(dynamic-wind
(lambda ()
(set! enabled? (%set-automatic-finalization-enabled?! #f)))
(lambda ()
- exp)
+ body ...)
(lambda ()
(%set-automatic-finalization-enabled?! enabled?)))))
+(define-syntax-rule (without-garbage-collection body ...)
+ "Turn off garbage collection within the dynamic extent of BODY. This is useful
+to avoid the creation new garbage collection thread. Note that pre-existing
+GC marker threads are only disabled, not terminated."
+ (dynamic-wind
+ (lambda ()
+ (gc-disable))
+ (lambda ()
+ body ...)
+ (lambda ()
+ (gc-enable))))
+
+(define-syntax-rule (without-threads body ...)
+ "Ensure the Guile finalizer thread is stopped and that garbage collection does
+not run. Note that pre-existing GC marker threads are only disabled, not
+terminated. This also leaves the signal handling thread to be disabled by
+another means, since there is no Guile API to do so."
+ ;; Note: the three kind of threads that Guile can spawn are the finalization
+ ;; thread, the signal thread, or the GC marker threads.
+ (without-automatic-finalization
+ (without-garbage-collection body ...)))
+
+(define (ensure-signal-delivery-thread)
+ "Ensure the signal delivery thread is spawned and its state set
+ to 'RUNNING'. This is valid as of the implementation as of Guile 3.0.9."
+ (sigaction SIGUSR1)) ;could be any signal
+
;; The libc interface to sys_clone is not useful for Scheme programs, so the
;; low-level system call is wrapped instead. The 'syscall' function is
;; declared in <unistd.h> as a variadic function; in practice, it expects 6
@@ 1223,6 1252,24 @@ are shared between the parent and child processes."
(list err))
ret)))))
+(define (safe-clone flags child parent)
+ "This is a raw clone syscall wrapper that ensures no Guile thread will be
+spawned during execution of the child. `clone' is called with FLAGS. CHILD
+is a thunk to run in the child process. PARENT is procedure that accepts the
+child PID as argument. This is useful in many contexts, such as when calling
+`unshare' or async-unsafe procedures in the child when the parent process
+memory (CLONE_VM) or threads (CLONE_THREAD) are shared with it."
+ ;; TODO: Contribute `clone' to Guile, and handle these complications there,
+ ;; similarly to how it's handled for scm_fork in posix.c.
+
+ ;; XXX: This is a hack: as of Guile 3.0.9, by starting the signal delivery
+ ;; thread in the parent, its state will be known as RUNNING, and the child
+ ;; won't attempt to start it itself.
+ (ensure-signal-delivery-thread)
+ (match (clone flags)
+ (0 (without-threads (child)))
+ (pid (parent pid))))
+
(define (thread-count)
"Return the complete thread count of the current process. Unlike
`all-threads', this also counts the Guile signal delivery, and finalizer
M tests/syscalls.scm => tests/syscalls.scm +35 -1
@@ 3,6 3,7 @@
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2020 Simon South <simon@simonsouth.net>
;;; Copyright © 2020 Mathieu Othacehe <m.othacehe@gmail.com>
+;;; Copyright © 2025 Maxim Cournoyer <maxim@guixotic.coop>
;;;
;;; This file is part of GNU Guix.
;;;
@@ 29,7 30,8 @@
#:use-module (srfi srfi-71)
#:use-module (system foreign)
#:use-module ((ice-9 ftw) #:select (scandir))
- #:use-module (ice-9 match))
+ #:use-module (ice-9 match)
+ #:use-module (ice-9 threads))
;; Test the (guix build syscalls) module, although there's not much that can
;; actually be tested without being root.
@@ 158,6 160,38 @@
(lambda args
(system-error-errno args))))
+(define child-thunk
+ (lambda ()
+ (gc) ;spawn GC threads
+ (primitive-exit
+ (catch 'system-error
+ (lambda ()
+ (unshare CLONE_THREAD)
+ 0) ;no error
+ (lambda args
+ (system-error-errno args))))))
+
+(define parent-proc
+ (lambda (pid)
+ (match (waitpid pid)
+ ((_ . status)
+ (status:exit-val status)))))
+
+(unless perform-container-tests?
+ (test-skip 1))
+(test-equal "clone and unshare triggers EINVAL"
+ EINVAL
+ (match (clone (logior CLONE_NEWUSER SIGCHLD))
+ (0 (child-thunk))
+ (pid (parent-proc pid))))
+
+(unless perform-container-tests?
+ (test-skip 1))
+(test-equal "safe-clone and unshare succeeds"
+ 0
+ (safe-clone (logior CLONE_NEWUSER SIGCHLD)
+ child-thunk parent-proc))
+
(unless perform-container-tests?
(test-skip 1))
(test-assert "setns"