From 67e537541c043c701001f002bed0cda70ce72767 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 20 Jun 2017 15:12:12 -0400 Subject: [PATCH] syscall: use CLONE_VFORK safely Currently, CLONE_VFORK is used without much regard to the stack. This is dangerous, because anything the child does to the stack is visible to the parent. For example, if the compiler were to reuse named stack slots (which it currently doesn't do), it would be easy for the child running in the same stack frame as the parent to corrupt local variables that the parent then depended on. We're not sure of anything specific going wrong in this code right now, but it is at best a ticking time bomb. CLONE_VFORK can only safely be used if we ensure the child does not execute in any of the active stack frames of the parent. This commit implements this by arranging for the parent to return immediately from the frame the child will operate in, and for the child to never return to the frame the parent will operate in. Fixes #20732. Change-Id: Iad5b4ddc2b994c082bd278bfd52ef53bd38c037f Reviewed-on: https://go-review.googlesource.com/46173 Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot Reviewed-by: Ian Lance Taylor --- src/syscall/exec_linux.go | 70 +++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go index e631cd6470..db52360717 100644 --- a/src/syscall/exec_linux.go +++ b/src/syscall/exec_linux.go @@ -63,15 +63,46 @@ func runtime_AfterForkInChild() // functions that do not grow the stack. //go:norace func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { + // Set up and fork. This returns immediately in the parent or + // if there's an error. + r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe) + if locked { + runtime_AfterFork() + } + if err1 != 0 { + return 0, err1 + } + + // parent; return PID + pid = int(r1) + + if sys.UidMappings != nil || sys.GidMappings != nil { + Close(p[0]) + err := writeUidGidMappings(pid, sys) + var err2 Errno + if err != nil { + err2 = err.(Errno) + } + RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) + Close(p[1]) + } + + return pid, 0 +} + +//go:norace +func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) { + // vfork requires that the child not touch any of the parent's + // active stack frames. Hence, the child does all post-fork + // processing in this stack frame and never returns, while the + // parent returns immediately from this frame and does all + // post-fork processing in the outer frame. // Declare all variables at top in case any // declarations require heap allocation (e.g., err1). var ( - r1 uintptr - err1 Errno err2 Errno nextfd int i int - p [2]int ) // Record parent PID so child can test if it has died. @@ -94,13 +125,15 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr // synchronizing writing of User ID/Group ID mappings. if sys.UidMappings != nil || sys.GidMappings != nil { if err := forkExecPipe(p[:]); err != nil { - return 0, err.(Errno) + err1 = err.(Errno) + return } } // About to call fork. // No more allocation or calls of non-assembly functions. runtime_BeforeFork() + locked = true switch { case runtime.GOARCH == "amd64" && sys.Cloneflags&CLONE_NEWUSER == 0: r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags) @@ -109,27 +142,14 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr default: r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) } - if err1 != 0 { - runtime_AfterFork() - return 0, err1 - } - - if r1 != 0 { - // parent; return PID - runtime_AfterFork() - pid = int(r1) - - if sys.UidMappings != nil || sys.GidMappings != nil { - Close(p[0]) - err := writeUidGidMappings(pid, sys) - if err != nil { - err2 = err.(Errno) - } - RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) - Close(p[1]) - } - - return pid, 0 + if err1 != 0 || r1 != 0 { + // If we're in the parent, we must return immediately + // so we're not in the same stack frame as the child. + // This can at most use the return PC, which the child + // will not modify, and the results of + // rawVforkSyscall, which must have been written after + // the child was replaced. + return } // Fork succeeded, now in child.