diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go index 79c0d774222..ec8f296bca9 100644 --- a/src/syscall/exec_linux.go +++ b/src/syscall/exec_linux.go @@ -94,6 +94,29 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr return pid, 0 } +const _LINUX_CAPABILITY_VERSION_3 = 0x20080522 + +type capHeader struct { + version uint32 + pid int32 +} + +type capData struct { + effective uint32 + permitted uint32 + inheritable uint32 +} +type caps struct { + hdr capHeader + data [2]capData +} + +// See CAP_TO_INDEX in linux/capability.h: +func capToIndex(cap uintptr) uintptr { return cap >> 5 } + +// See CAP_TO_MASK in linux/capability.h: +func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) } + // forkAndExecInChild1 implements the body of forkAndExecInChild up to // the parent's post-fork path. This is a separate function so we can // separate the child's and parent's stack frames if we're using @@ -122,6 +145,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att err2 Errno nextfd int i int + caps caps ) // Record parent PID so child can test if it has died. @@ -286,11 +310,32 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att } } - for _, c := range sys.AmbientCaps { - _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0) - if err1 != 0 { + if len(sys.AmbientCaps) != 0 { + // Ambient capabilities were added in the 4.3 kernel, + // so it is safe to always use _LINUX_CAPABILITY_VERSION_3. + caps.hdr.version = _LINUX_CAPABILITY_VERSION_3 + + if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 { goto childerror } + + for _, c := range sys.AmbientCaps { + // Add the c capability to the permitted and inheritable capability mask, + // otherwise we will not be able to add it to the ambient capability mask. + caps.data[capToIndex(c)].permitted |= capToMask(c) + caps.data[capToIndex(c)].inheritable |= capToMask(c) + } + + if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 { + goto childerror + } + + for _, c := range sys.AmbientCaps { + _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0) + if err1 != 0 { + goto childerror + } + } } // Chdir diff --git a/src/syscall/exec_linux_test.go b/src/syscall/exec_linux_test.go index ac5745bc80b..dc16a9d9fe4 100644 --- a/src/syscall/exec_linux_test.go +++ b/src/syscall/exec_linux_test.go @@ -436,7 +436,7 @@ func TestUnshareMountNameSpaceChroot(t *testing.T) { type capHeader struct { version uint32 - pid int + pid int32 } type capData struct { @@ -446,6 +446,7 @@ type capData struct { } const CAP_SYS_TIME = 25 +const CAP_SYSLOG = 34 type caps struct { hdr capHeader @@ -506,15 +507,28 @@ func TestAmbientCapsHelper(*testing.T) { fmt.Fprintln(os.Stderr, "CAP_SYS_TIME unexpectedly not in the effective capability mask") os.Exit(2) } + if caps.data[1].effective&(1<