~ruther/guix-local

10edda5aa5f92e7bfcff4902d4f3c11373d1e725 — Maxim Cournoyer 4 months ago d014454
Reinstate "linux-container: Remove #:lock-mounts? and related code."

This reverts commit e0e64be8de3d220a12612b3a2e4aee428277d865.
3 files changed, 107 insertions(+), 41 deletions(-)

M gnu/build/linux-container.scm
M gnu/system/linux-container.scm
M tests/containers.scm
M gnu/build/linux-container.scm => gnu/build/linux-container.scm +74 -37
@@ 190,7 190,10 @@ for the process."
      (remount-read-only "/"))))

(define* (initialize-user-namespace pid host-uids
                                    #:key (guest-uid 0) (guest-gid 0))
                                    #:key
                                    (host-uid (getuid))
                                    (host-gid (getgid))
                                    (guest-uid 0) (guest-gid 0))
  "Configure the user namespace for PID.  HOST-UIDS specifies the number of
host user identifiers to map into the user namespace.  GUEST-UID and GUEST-GID
specify the first UID (respectively GID) that host UIDs (respectively GIDs)


@@ 201,24 204,21 @@ map to in the namespace."
  (define (scope file)
    (string-append proc-dir file))

  (let ((uid (getuid))
        (gid (getgid)))

    ;; Only root can write to the gid map without first disabling the
    ;; setgroups syscall.
    (unless (and (zero? uid) (zero? gid))
      (call-with-output-file (scope "/setgroups")
        (lambda (port)
          (display "deny" port))))

    ;; Map the user/group that created the container to the root user
    ;; within the container.
    (call-with-output-file (scope "/uid_map")
      (lambda (port)
        (format port "~d ~d ~d" guest-uid uid host-uids)))
    (call-with-output-file (scope "/gid_map")
  ;; Only root can write to the gid map without first disabling the
  ;; setgroups syscall.
  (unless (and (zero? host-uid) (zero? host-gid))
    (call-with-output-file (scope "/setgroups")
      (lambda (port)
        (format port "~d ~d ~d" guest-gid gid host-uids)))))
        (display "deny" port))))

  ;; Map the user/group that created the container to the root user
  ;; within the container.
  (call-with-output-file (scope "/uid_map")
    (lambda (port)
      (format port "~d ~d ~d" guest-uid host-uid host-uids)))
  (call-with-output-file (scope "/gid_map")
    (lambda (port)
      (format port "~d ~d ~d" guest-gid host-gid host-uids))))

(define (namespaces->bit-mask namespaces)
  "Return the number suitable for the 'flags' argument of 'clone' that


@@ 239,12 239,14 @@ corresponds to the symbols in NAMESPACES."
                        #:key (guest-uid 0) (guest-gid 0)
                        (populate-file-system (const #t))
                        (loopback-network? #t)
                        (lock-mounts? #t)
                        writable-root?)
  "Run THUNK in a new container process and return its PID.  ROOT specifies
the root directory for the container.  MOUNTS is a list of <file-system>
objects that specify file systems to mount inside the container.  NAMESPACES
is a list of symbols that correspond to the possible Linux namespaces: mnt,
ipc, uts, user, and net.
ipc, uts, user, and net.  When LOCK-MOUNTS? is true, arrange so that none of
MOUNTS can be unmounted or remounted individually from within THUNK.

When LOOPBACK-NETWORK? is true and 'net is among NAMESPACES, set up the
loopback device (\"lo\") and a minimal /etc/hosts.


@@ 304,6 306,28 @@ that host UIDs (respectively GIDs) map to in the namespace."
                      ;; cannot be 'read' so they shouldn't be written as is.
                      (write args child)
                      (primitive-exit 3))))

                (when (and lock-mounts?
                           (memq 'mnt namespaces)
                           (memq 'user namespaces))
                  ;; Create a new mount namespace owned by a new user
                  ;; namespace to "lock" together previous mounts, such that
                  ;; they cannot be unmounted or remounted separately--see
                  ;; mount_namespaces(7).
                  ;;
                  ;; Note: at this point, the process is single-threaded (no
                  ;; GC mark threads, no finalization thread, etc.) which is
                  ;; why unshare(CLONE_NEWUSER) can be used.
                  (let ((uid (getuid)) (gid (getgid)))
                    (unshare (logior CLONE_NEWUSER CLONE_NEWNS))
                    (when (file-exists? "/proc/self")
                      (initialize-user-namespace (getpid)
                                                 host-uids
                                                 #:host-uid uid
                                                 #:host-gid gid
                                                 #:guest-uid guest-uid
                                                 #:guest-gid guest-gid))))

                ;; TODO: Manage capabilities.
                (write 'ready child)
                (close-port child)


@@ 376,6 400,7 @@ if there are no child processes left."

(define* (call-with-container mounts thunk #:key (namespaces %namespaces)
                              (host-uids 1) (guest-uid 0) (guest-gid 0)
                              (lock-mounts? #t)
                              (relayed-signals (list SIGINT SIGTERM))
                              (child-is-pid1? #t)
                              (populate-file-system (const #t))


@@ 460,6 485,7 @@ load path must be adjusted as needed."
  (call-with-temporary-directory
   (lambda (root)
     (let ((pid (run-container root mounts namespaces host-uids thunk*
                               #:lock-mounts? lock-mounts?
                               #:guest-uid guest-uid
                               #:guest-gid guest-gid
                               #:populate-file-system populate-file-system


@@ 480,24 506,35 @@ return the exit status, an integer as returned by 'waitpid'."
    (0
     (call-with-clean-exit
      (lambda ()
        (for-each (lambda (ns)
                    (let ((source (namespace-file (getpid) ns))
                          (target (namespace-file pid ns)))
                      ;; Joining the namespace that the process already
                      ;; belongs to would throw an error so avoid that.
                      ;; XXX: This /proc interface leads to TOCTTOU.
                      (unless (string=? (readlink source) (readlink target))
                        (call-with-input-file source
                          (lambda (current-ns-port)
                            (call-with-input-file target
                              (lambda (new-ns-port)
                                (setns (fileno new-ns-port) 0))))))))
                  ;; It's important that the user namespace is joined first,
                  ;; so that the user will have the privileges to join the
                  ;; other namespaces.  Furthermore, it's important that the
                  ;; mount namespace is joined last, otherwise the /proc mount
                  ;; point would no longer be accessible.
                  '("user" "ipc" "uts" "net" "pid" "mnt"))
        ;; First, determine the user namespace that owns the pid namespace and
        ;; join that user namespace (the assumption is that it also owns all
        ;; the other namespaces).  It's important that the user namespace is
        ;; joined first, so that the user will have the privileges to join the
        ;; other namespaces.
        (let* ((pid-ns (open-fdes (namespace-file pid "pid")
                                  (logior O_CLOEXEC O_RDONLY)))
               (user-ns (get-user-ns pid-ns)))
          (close-fdes pid-ns)
          (unless (equal? (stat user-ns)
                          (stat (namespace-file (getpid) "user")))
            (setns user-ns 0))
          (close-fdes user-ns)

          ;; Then join all the remaining namespaces.
          (for-each (lambda (ns)
                      (let ((source (namespace-file (getpid) ns))
                            (target (namespace-file pid ns)))
                        ;; Joining the namespace that the process already
                        ;; belongs to would throw an error so avoid that.
                        ;; XXX: This /proc interface leads to TOCTTOU.
                        (unless (string=? (readlink source) (readlink target))
                          (call-with-input-file target
                            (lambda (new-ns-port)
                              (setns (fileno new-ns-port) 0))))))
                    ;; It's important that the mount namespace is joined last,
                    ;; otherwise the /proc mount point would no longer be
                    ;; accessible.
                    '("ipc" "uts" "net" "pid" "mnt")))
        (purify-environment)
        (chdir "/")


M gnu/system/linux-container.scm => gnu/system/linux-container.scm +4 -0
@@ 317,6 317,10 @@ Run the container with the given options."))
                #:namespaces (if #$shared-network?
                                 (delq 'net %namespaces)
                                 %namespaces)

                ;; XXX: Work around <https://issues.guix.gnu.org/78356>.
                #:lock-mounts? #f

                #:writable-root? #t
                #:process-spawned-hook explain)))))


M tests/containers.scm => tests/containers.scm +29 -4
@@ 1,6 1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2016, 2017, 2019, 2023 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2016-2017, 2019, 2023, 2025 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;


@@ 111,6 111,26 @@
     #:namespaces '(user mnt))))

(skip-if-unsupported)
;; Run a thunk in a container with the 'user' and 'mnt' namespaces and a
;; tmpfs mounted on "/testing"; the thunk tries to unmount it and reports
;; the outcome through its exit code, which is compared against EINVAL.
(test-equal "call-with-container, mnt namespace, locked mounts"
  EINVAL
  ;; umount(2) fails with EINVAL when targeting a mount point that is
  ;; "locked".
  (status:exit-val
   (call-with-container (list (file-system
                                (device "none")
                                (mount-point "/testing")
                                (type "tmpfs")
                                (check? #f)))
     (lambda ()
       ;; Exit with 0 if 'umount' unexpectedly succeeds; otherwise exit with
       ;; the errno carried by the 'system-error' exception it raised.
       (primitive-exit (catch 'system-error
                         (lambda ()
                           (umount "/testing")
                           0)
                         (lambda args
                           (system-error-errno args)))))
     #:namespaces '(user mnt))))

(skip-if-unsupported)
(test-equal "call-with-container, mnt namespace, wrong bind mount"
  `(system-error ,ENOENT)
  ;; An exception should be raised; see <http://bugs.gnu.org/23306>.


@@ 169,7 189,8 @@
     #:namespaces '(user mnt))))

(skip-if-unsupported)
(test-assert "container-excursion"
(test-equal "container-excursion"
  0
  (call-with-temporary-directory
   (lambda (root)
     ;; Two pipes: One for the container to signal that the test can begin,


@@ 193,7 214,11 @@
                   (readlink (string-append "/proc/" pid "/ns/" ns)))
                 '("user" "ipc" "uts" "net" "pid" "mnt"))))

        (let* ((pid (run-container root '() %namespaces 1 container))
        (let* ((pid (run-container root '() %namespaces 1 container
                                   ;; Do not lock mounts so the user namespace
                                   ;; appears to be the same seen from inside
                                   ;; and from outside.
                                   #:lock-mounts? #f))
               (container-namespaces (namespaces pid))
               (result
                (begin


@@ 213,7 238,7 @@
          (write 'done end-out)
          (close end-out)
          (waitpid pid)
          (zero? result)))))))
          result))))))

(skip-if-unsupported)
(test-equal "container-excursion, same namespaces"