mirror of
https://github.com/golang/go
synced 2024-11-26 04:07:59 -07:00
syscall: add CgroupFD support for ForkExec on Linux
Implement CLONE_INTO_CGROUP feature, allowing to put a child in a specified cgroup in a clean and simple way. Note that the feature only works for cgroup v2, and requires Linux kernel 5.7 or newer. Using the feature requires a new syscall, clone3. Currently this is the only reason to use clone3, but the code is structured in a way so that other cases may be easily added in the future. Add a test case. While at it, try to simplify the syscall calling code in forkAndExecInChild1, which became complicated over time because: 1. It was using either rawVforkSyscall or RawSyscall6 depending on whether CLONE_NEWUSER was set. 2. On Linux/s390, the first two arguments to clone(2) system call are swapped (which deserved a mention in Linux ABI hall of shame). It was worked around in rawVforkSyscall on s390, but had to be implemented via a switch/case when using RawSyscall6, making the code less clear. Let's - modify rawVforkSyscall to have two arguments (which is also required for clone3); - remove the arguments workaround from s390 asm, instead implementing arguments swap in the caller (which still looks ugly but at least it's done once and is clearly documented now); - use rawVforkSyscall for all cases (since it is essentially similar to RawSyscall6, except for having less parameters, not returning r2, and saving/restoring the return address before/after syscall on 386 and amd64). Updates #51246. Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e Reviewed-on: https://go-review.googlesource.com/c/go/+/417695 Auto-Submit: Ian Lance Taylor <iant@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Ian Lance Taylor <iant@google.com> Run-TryBot: Kirill Kolyshkin <kolyshkin@gmail.com> TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
parent
f53b2111e4
commit
bca17d16ca
@ -8,6 +8,8 @@ pkg syscall (linux-386), const CLONE_NEWTIME = 128 #51246
|
||||
pkg syscall (linux-386), const CLONE_NEWTIME ideal-int #51246
|
||||
pkg syscall (linux-386), const CLONE_PIDFD = 4096 #51246
|
||||
pkg syscall (linux-386), const CLONE_PIDFD ideal-int #51246
|
||||
pkg syscall (linux-386), type SysProcAttr struct, CgroupFD int #51246
|
||||
pkg syscall (linux-386), type SysProcAttr struct, UseCgroupFD bool #51246
|
||||
pkg syscall (linux-386-cgo), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
|
||||
pkg syscall (linux-386-cgo), const CLONE_CLEAR_SIGHAND ideal-int #51246
|
||||
pkg syscall (linux-386-cgo), const CLONE_INTO_CGROUP = 8589934592 #51246
|
||||
@ -18,6 +20,8 @@ pkg syscall (linux-386-cgo), const CLONE_NEWTIME = 128 #51246
|
||||
pkg syscall (linux-386-cgo), const CLONE_NEWTIME ideal-int #51246
|
||||
pkg syscall (linux-386-cgo), const CLONE_PIDFD = 4096 #51246
|
||||
pkg syscall (linux-386-cgo), const CLONE_PIDFD ideal-int #51246
|
||||
pkg syscall (linux-386-cgo), type SysProcAttr struct, CgroupFD int #51246
|
||||
pkg syscall (linux-386-cgo), type SysProcAttr struct, UseCgroupFD bool #51246
|
||||
pkg syscall (linux-amd64), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
|
||||
pkg syscall (linux-amd64), const CLONE_CLEAR_SIGHAND ideal-int #51246
|
||||
pkg syscall (linux-amd64), const CLONE_INTO_CGROUP = 8589934592 #51246
|
||||
@ -28,6 +32,8 @@ pkg syscall (linux-amd64), const CLONE_NEWTIME = 128 #51246
|
||||
pkg syscall (linux-amd64), const CLONE_NEWTIME ideal-int #51246
|
||||
pkg syscall (linux-amd64), const CLONE_PIDFD = 4096 #51246
|
||||
pkg syscall (linux-amd64), const CLONE_PIDFD ideal-int #51246
|
||||
pkg syscall (linux-amd64), type SysProcAttr struct, CgroupFD int #51246
|
||||
pkg syscall (linux-amd64), type SysProcAttr struct, UseCgroupFD bool #51246
|
||||
pkg syscall (linux-amd64-cgo), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
|
||||
pkg syscall (linux-amd64-cgo), const CLONE_CLEAR_SIGHAND ideal-int #51246
|
||||
pkg syscall (linux-amd64-cgo), const CLONE_INTO_CGROUP = 8589934592 #51246
|
||||
@ -38,6 +44,8 @@ pkg syscall (linux-amd64-cgo), const CLONE_NEWTIME = 128 #51246
|
||||
pkg syscall (linux-amd64-cgo), const CLONE_NEWTIME ideal-int #51246
|
||||
pkg syscall (linux-amd64-cgo), const CLONE_PIDFD = 4096 #51246
|
||||
pkg syscall (linux-amd64-cgo), const CLONE_PIDFD ideal-int #51246
|
||||
pkg syscall (linux-amd64-cgo), type SysProcAttr struct, CgroupFD int #51246
|
||||
pkg syscall (linux-amd64-cgo), type SysProcAttr struct, UseCgroupFD bool #51246
|
||||
pkg syscall (linux-arm), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
|
||||
pkg syscall (linux-arm), const CLONE_CLEAR_SIGHAND ideal-int #51246
|
||||
pkg syscall (linux-arm), const CLONE_INTO_CGROUP = 8589934592 #51246
|
||||
@ -48,6 +56,8 @@ pkg syscall (linux-arm), const CLONE_NEWTIME = 128 #51246
|
||||
pkg syscall (linux-arm), const CLONE_NEWTIME ideal-int #51246
|
||||
pkg syscall (linux-arm), const CLONE_PIDFD = 4096 #51246
|
||||
pkg syscall (linux-arm), const CLONE_PIDFD ideal-int #51246
|
||||
pkg syscall (linux-arm), type SysProcAttr struct, CgroupFD int #51246
|
||||
pkg syscall (linux-arm), type SysProcAttr struct, UseCgroupFD bool #51246
|
||||
pkg syscall (linux-arm-cgo), const CLONE_CLEAR_SIGHAND = 4294967296 #51246
|
||||
pkg syscall (linux-arm-cgo), const CLONE_CLEAR_SIGHAND ideal-int #51246
|
||||
pkg syscall (linux-arm-cgo), const CLONE_INTO_CGROUP = 8589934592 #51246
|
||||
@ -58,3 +68,5 @@ pkg syscall (linux-arm-cgo), const CLONE_NEWTIME = 128 #51246
|
||||
pkg syscall (linux-arm-cgo), const CLONE_NEWTIME ideal-int #51246
|
||||
pkg syscall (linux-arm-cgo), const CLONE_PIDFD = 4096 #51246
|
||||
pkg syscall (linux-arm-cgo), const CLONE_PIDFD ideal-int #51246
|
||||
pkg syscall (linux-arm-cgo), type SysProcAttr struct, CgroupFD int #51246
|
||||
pkg syscall (linux-arm-cgo), type SysProcAttr struct, UseCgroupFD bool #51246
|
||||
|
@ -13,24 +13,24 @@
|
||||
// instead of the glibc-specific "CALL 0x10(GS)".
|
||||
#define INVOKE_SYSCALL INT $0x80
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-20
|
||||
MOVL trap+0(FP), AX // syscall entry
|
||||
MOVL a1+4(FP), BX
|
||||
MOVL $0, CX
|
||||
MOVL a2+8(FP), CX
|
||||
MOVL $0, DX
|
||||
POPL SI // preserve return address
|
||||
INVOKE_SYSCALL
|
||||
PUSHL SI
|
||||
CMPL AX, $0xfffff001
|
||||
JLS ok
|
||||
MOVL $-1, r1+8(FP)
|
||||
MOVL $-1, r1+12(FP)
|
||||
NEGL AX
|
||||
MOVL AX, err+12(FP)
|
||||
MOVL AX, err+16(FP)
|
||||
RET
|
||||
ok:
|
||||
MOVL AX, r1+8(FP)
|
||||
MOVL $0, err+12(FP)
|
||||
MOVL AX, r1+12(FP)
|
||||
MOVL $0, err+16(FP)
|
||||
RET
|
||||
|
||||
// func rawSyscallNoError(trap uintptr, a1, a2, a3 uintptr) (r1, r2 uintptr);
|
||||
|
@ -11,10 +11,10 @@
|
||||
|
||||
#define SYS_gettimeofday 96
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
|
||||
MOVQ a1+8(FP), DI
|
||||
MOVQ $0, SI
|
||||
MOVQ a2+16(FP), SI
|
||||
MOVQ $0, DX
|
||||
MOVQ $0, R10
|
||||
MOVQ $0, R8
|
||||
@ -25,13 +25,13 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
PUSHQ R12
|
||||
CMPQ AX, $0xfffffffffffff001
|
||||
JLS ok2
|
||||
MOVQ $-1, r1+16(FP)
|
||||
MOVQ $-1, r1+24(FP)
|
||||
NEGQ AX
|
||||
MOVQ AX, err+24(FP)
|
||||
MOVQ AX, err+32(FP)
|
||||
RET
|
||||
ok2:
|
||||
MOVQ AX, r1+16(FP)
|
||||
MOVQ $0, err+24(FP)
|
||||
MOVQ AX, r1+24(FP)
|
||||
MOVQ $0, err+32(FP)
|
||||
RET
|
||||
|
||||
// func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
|
||||
|
@ -41,25 +41,25 @@ okseek:
|
||||
BL runtime·exitsyscall(SB)
|
||||
RET
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-20
|
||||
MOVW trap+0(FP), R7 // syscall entry
|
||||
MOVW a1+4(FP), R0
|
||||
MOVW $0, R1
|
||||
MOVW a2+8(FP), R1
|
||||
MOVW $0, R2
|
||||
SWI $0
|
||||
MOVW $0xfffff001, R1
|
||||
CMP R1, R0
|
||||
BLS ok
|
||||
MOVW $-1, R1
|
||||
MOVW R1, r1+8(FP)
|
||||
MOVW R1, r1+12(FP)
|
||||
RSB $0, R0, R0
|
||||
MOVW R0, err+12(FP)
|
||||
MOVW R0, err+16(FP)
|
||||
RET
|
||||
ok:
|
||||
MOVW R0, r1+8(FP)
|
||||
MOVW R0, r1+12(FP)
|
||||
MOVW $0, R0
|
||||
MOVW R0, err+12(FP)
|
||||
MOVW R0, err+16(FP)
|
||||
RET
|
||||
|
||||
// func rawSyscallNoError(trap uintptr, a1, a2, a3 uintptr) (r1, r2 uintptr);
|
||||
|
@ -4,10 +4,10 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-40
|
||||
MOVD a1+8(FP), R0
|
||||
MOVD $0, R1
|
||||
MOVD a2+16(FP), R1
|
||||
MOVD $0, R2
|
||||
MOVD $0, R3
|
||||
MOVD $0, R4
|
||||
@ -17,13 +17,13 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
|
||||
CMN $4095, R0
|
||||
BCC ok
|
||||
MOVD $-1, R4
|
||||
MOVD R4, r1+16(FP) // r1
|
||||
MOVD R4, r1+24(FP) // r1
|
||||
NEG R0, R0
|
||||
MOVD R0, err+24(FP) // errno
|
||||
MOVD R0, err+32(FP) // errno
|
||||
RET
|
||||
ok:
|
||||
MOVD R0, r1+16(FP) // r1
|
||||
MOVD ZR, err+24(FP) // errno
|
||||
MOVD R0, r1+24(FP) // r1
|
||||
MOVD ZR, err+32(FP) // errno
|
||||
RET
|
||||
|
||||
// func rawSyscallNoError(trap uintptr, a1, a2, a3 uintptr) (r1, r2 uintptr);
|
||||
|
@ -8,10 +8,10 @@
|
||||
// System calls for loong64, Linux
|
||||
//
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-40
|
||||
MOVV a1+8(FP), R4
|
||||
MOVV $0, R5
|
||||
MOVV a2+16(FP), R5
|
||||
MOVV $0, R6
|
||||
MOVV $0, R7
|
||||
MOVV $0, R8
|
||||
@ -21,13 +21,13 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
|
||||
MOVW $-4096, R12
|
||||
BGEU R12, R4, ok
|
||||
MOVV $-1, R12
|
||||
MOVV R12, r1+16(FP) // r1
|
||||
MOVV R12, r1+24(FP) // r1
|
||||
SUBVU R4, R0, R4
|
||||
MOVV R4, err+24(FP) // errno
|
||||
MOVV R4, err+32(FP) // errno
|
||||
RET
|
||||
ok:
|
||||
MOVV R4, r1+16(FP) // r1
|
||||
MOVV R0, err+24(FP) // errno
|
||||
MOVV R4, r1+24(FP) // r1
|
||||
MOVV R0, err+32(FP) // errno
|
||||
RET
|
||||
|
||||
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
|
||||
|
@ -10,10 +10,10 @@
|
||||
// System calls for mips64, Linux
|
||||
//
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
|
||||
MOVV a1+8(FP), R4
|
||||
MOVV R0, R5
|
||||
MOVV a2+16(FP), R5
|
||||
MOVV R0, R6
|
||||
MOVV R0, R7
|
||||
MOVV R0, R8
|
||||
@ -22,12 +22,12 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
SYSCALL
|
||||
BEQ R7, ok
|
||||
MOVV $-1, R1
|
||||
MOVV R1, r1+16(FP) // r1
|
||||
MOVV R2, err+24(FP) // errno
|
||||
MOVV R1, r1+24(FP) // r1
|
||||
MOVV R2, err+32(FP) // errno
|
||||
RET
|
||||
ok:
|
||||
MOVV R2, r1+16(FP) // r1
|
||||
MOVV R0, err+24(FP) // errno
|
||||
MOVV R2, r1+24(FP) // r1
|
||||
MOVV R0, err+32(FP) // errno
|
||||
RET
|
||||
|
||||
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
|
||||
|
@ -44,21 +44,21 @@ ok9:
|
||||
JAL runtime·exitsyscall(SB)
|
||||
RET
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-16
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-20
|
||||
MOVW a1+4(FP), R4
|
||||
MOVW R0, R5
|
||||
MOVW a2+8(FP), R5
|
||||
MOVW R0, R6
|
||||
MOVW trap+0(FP), R2 // syscall entry
|
||||
SYSCALL
|
||||
BEQ R7, ok
|
||||
MOVW $-1, R1
|
||||
MOVW R1, r1+8(FP) // r1
|
||||
MOVW R2, err+12(FP) // errno
|
||||
MOVW R1, r1+12(FP) // r1
|
||||
MOVW R2, err+16(FP) // errno
|
||||
RET
|
||||
ok:
|
||||
MOVW R2, r1+8(FP) // r1
|
||||
MOVW R0, err+12(FP) // errno
|
||||
MOVW R2, r1+12(FP) // r1
|
||||
MOVW R0, err+16(FP) // errno
|
||||
RET
|
||||
|
||||
TEXT ·rawSyscallNoError(SB),NOSPLIT,$20-24
|
||||
|
@ -10,10 +10,10 @@
|
||||
// System calls for ppc64, Linux
|
||||
//
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
|
||||
MOVD a1+8(FP), R3
|
||||
MOVD R0, R4
|
||||
MOVD a2+16(FP), R4
|
||||
MOVD R0, R5
|
||||
MOVD R0, R6
|
||||
MOVD R0, R7
|
||||
@ -22,12 +22,12 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
SYSCALL R9
|
||||
BVC ok
|
||||
MOVD $-1, R4
|
||||
MOVD R4, r1+16(FP) // r1
|
||||
MOVD R3, err+24(FP) // errno
|
||||
MOVD R4, r1+24(FP) // r1
|
||||
MOVD R3, err+32(FP) // errno
|
||||
RET
|
||||
ok:
|
||||
MOVD R3, r1+16(FP) // r1
|
||||
MOVD R0, err+24(FP) // errno
|
||||
MOVD R3, r1+24(FP) // r1
|
||||
MOVD R0, err+32(FP) // errno
|
||||
RET
|
||||
|
||||
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
|
||||
|
@ -8,10 +8,10 @@
|
||||
// System calls for riscv64, Linux
|
||||
//
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
|
||||
MOV a1+8(FP), A0
|
||||
MOV ZERO, A1
|
||||
MOV a2+16(FP), A1
|
||||
MOV ZERO, A2
|
||||
MOV ZERO, A3
|
||||
MOV ZERO, A4
|
||||
@ -20,14 +20,14 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
ECALL
|
||||
MOV $-4096, T0
|
||||
BLTU T0, A0, err
|
||||
MOV A0, r1+16(FP) // r1
|
||||
MOV ZERO, err+24(FP) // errno
|
||||
MOV A0, r1+24(FP) // r1
|
||||
MOV ZERO, err+32(FP) // errno
|
||||
RET
|
||||
err:
|
||||
MOV $-1, T0
|
||||
MOV T0, r1+16(FP) // r1
|
||||
MOV T0, r1+24(FP) // r1
|
||||
SUB A0, ZERO, A0
|
||||
MOV A0, err+24(FP) // errno
|
||||
MOV A0, err+32(FP) // errno
|
||||
RET
|
||||
|
||||
TEXT ·rawSyscallNoError(SB),NOSPLIT,$0-48
|
||||
|
@ -8,10 +8,10 @@
|
||||
// System calls for s390x, Linux
|
||||
//
|
||||
|
||||
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
MOVD $0, R2
|
||||
MOVD a1+8(FP), R3
|
||||
// func rawVforkSyscall(trap, a1, a2 uintptr) (r1, err uintptr)
|
||||
TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-40
|
||||
MOVD a1+8(FP), R2
|
||||
MOVD a2+16(FP), R3
|
||||
MOVD $0, R4
|
||||
MOVD $0, R5
|
||||
MOVD $0, R6
|
||||
@ -20,13 +20,13 @@ TEXT ·rawVforkSyscall(SB),NOSPLIT|NOFRAME,$0-32
|
||||
SYSCALL
|
||||
MOVD $0xfffffffffffff001, R8
|
||||
CMPUBLT R2, R8, ok2
|
||||
MOVD $-1, r1+16(FP)
|
||||
MOVD $-1, r1+24(FP)
|
||||
NEG R2, R2
|
||||
MOVD R2, err+24(FP) // errno
|
||||
MOVD R2, err+32(FP) // errno
|
||||
RET
|
||||
ok2:
|
||||
MOVD R2, r1+16(FP)
|
||||
MOVD $0, err+24(FP) // errno
|
||||
MOVD R2, r1+24(FP)
|
||||
MOVD $0, err+32(FP) // errno
|
||||
RET
|
||||
|
||||
// func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
|
||||
|
@ -99,6 +99,8 @@ type SysProcAttr struct {
|
||||
// users this should be set to false for mappings work.
|
||||
GidMappingsEnableSetgroups bool
|
||||
AmbientCaps []uintptr // Ambient capabilities (Linux only)
|
||||
UseCgroupFD bool // Whether to make use of the CgroupFD field.
|
||||
CgroupFD int // File descriptor of a cgroup to put the new process into.
|
||||
}
|
||||
|
||||
var (
|
||||
@ -176,6 +178,21 @@ func capToIndex(cap uintptr) uintptr { return cap >> 5 }
|
||||
// See CAP_TO_MASK in linux/capability.h:
|
||||
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }
|
||||
|
||||
// cloneArgs holds arguments for clone3 Linux syscall.
|
||||
type cloneArgs struct {
|
||||
flags uint64 // Flags bit mask
|
||||
pidFD uint64 // Where to store PID file descriptor (int *)
|
||||
childTID uint64 // Where to store child TID, in child's memory (pid_t *)
|
||||
parentTID uint64 // Where to store child TID, in parent's memory (pid_t *)
|
||||
exitSignal uint64 // Signal to deliver to parent on child termination
|
||||
stack uint64 // Pointer to lowest byte of stack
|
||||
stackSize uint64 // Size of stack
|
||||
tls uint64 // Location of new TLS
|
||||
setTID uint64 // Pointer to a pid_t array (since Linux 5.5)
|
||||
setTIDSize uint64 // Number of elements in set_tid (since Linux 5.5)
|
||||
cgroup uint64 // File descriptor for target cgroup of child (since Linux 5.7)
|
||||
}
|
||||
|
||||
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
|
||||
// the parent's post-fork path. This is a separate function so we can
|
||||
// separate the child's and parent's stack frames if we're using
|
||||
@ -205,9 +222,10 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
|
||||
nextfd int
|
||||
i int
|
||||
caps caps
|
||||
fd1 uintptr
|
||||
fd1, flags uintptr
|
||||
puid, psetgroups, pgid []byte
|
||||
uidmap, setgroups, gidmap []byte
|
||||
clone3 *cloneArgs
|
||||
)
|
||||
|
||||
if sys.UidMappings != nil {
|
||||
@ -252,17 +270,33 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
|
||||
}
|
||||
}
|
||||
|
||||
flags = sys.Cloneflags
|
||||
if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
|
||||
flags |= CLONE_VFORK | CLONE_VM
|
||||
}
|
||||
// Whether to use clone3.
|
||||
if sys.UseCgroupFD {
|
||||
clone3 = &cloneArgs{
|
||||
flags: uint64(flags) | CLONE_INTO_CGROUP,
|
||||
exitSignal: uint64(SIGCHLD),
|
||||
cgroup: uint64(sys.CgroupFD),
|
||||
}
|
||||
}
|
||||
|
||||
// About to call fork.
|
||||
// No more allocation or calls of non-assembly functions.
|
||||
runtime_BeforeFork()
|
||||
locked = true
|
||||
switch {
|
||||
case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
|
||||
r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
|
||||
case runtime.GOARCH == "s390x":
|
||||
r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
|
||||
default:
|
||||
r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
|
||||
if clone3 != nil {
|
||||
r1, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3))
|
||||
} else {
|
||||
flags |= uintptr(SIGCHLD)
|
||||
if runtime.GOARCH == "s390x" {
|
||||
// On Linux/s390, the first two arguments of clone(2) are swapped.
|
||||
r1, err1 = rawVforkSyscall(SYS_CLONE, 0, flags)
|
||||
} else {
|
||||
r1, err1 = rawVforkSyscall(SYS_CLONE, flags, 0)
|
||||
}
|
||||
}
|
||||
if err1 != 0 || r1 != 0 {
|
||||
// If we're in the parent, we must return immediately
|
||||
|
@ -7,6 +7,7 @@
|
||||
package syscall_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"internal/testenv"
|
||||
@ -14,6 +15,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
@ -461,6 +463,96 @@ func TestUnshareUidGidMapping(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func prepareCgroupFD(t *testing.T) (int, string) {
|
||||
t.Helper()
|
||||
|
||||
const O_PATH = 0x200000 // Same for all architectures, but for some reason not defined in syscall for 386||amd64.
|
||||
|
||||
// Requires cgroup v2.
|
||||
const prefix = "/sys/fs/cgroup"
|
||||
selfCg, err := os.ReadFile("/proc/self/cgroup")
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) || os.IsPermission(err) {
|
||||
t.Skip(err)
|
||||
}
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Expect a single line like this:
|
||||
// 0::/user.slice/user-1000.slice/user@1000.service/app.slice/vte-spawn-891992a2-efbb-4f28-aedb-b24f9e706770.scope
|
||||
// Otherwise it's either cgroup v1 or a hybrid hierarchy.
|
||||
if bytes.Count(selfCg, []byte("\n")) > 1 {
|
||||
t.Skip("cgroup v2 not available")
|
||||
}
|
||||
cg := bytes.TrimPrefix(selfCg, []byte("0::"))
|
||||
if len(cg) == len(selfCg) { // No prefix found.
|
||||
t.Skipf("cgroup v2 not available (/proc/self/cgroup contents: %q)", selfCg)
|
||||
}
|
||||
|
||||
// Need clone3 with CLONE_INTO_CGROUP support.
|
||||
_, err = syscall.ForkExec("non-existent binary", nil, &syscall.ProcAttr{
|
||||
Sys: &syscall.SysProcAttr{
|
||||
UseCgroupFD: true,
|
||||
CgroupFD: -1,
|
||||
},
|
||||
})
|
||||
// // EPERM can be returned if clone3 is not enabled by seccomp.
|
||||
if err == syscall.ENOSYS || err == syscall.EPERM {
|
||||
t.Skipf("clone3 with CLONE_INTO_CGROUP not available: %v", err)
|
||||
}
|
||||
|
||||
// Need an ability to create a sub-cgroup.
|
||||
subCgroup, err := os.MkdirTemp(prefix+string(bytes.TrimSpace(cg)), "subcg-")
|
||||
if err != nil {
|
||||
if os.IsPermission(err) {
|
||||
t.Skip(err)
|
||||
}
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { syscall.Rmdir(subCgroup) })
|
||||
|
||||
cgroupFD, err := syscall.Open(subCgroup, O_PATH, 0)
|
||||
if err != nil {
|
||||
t.Fatal(&os.PathError{Op: "open", Path: subCgroup, Err: err})
|
||||
}
|
||||
t.Cleanup(func() { syscall.Close(cgroupFD) })
|
||||
|
||||
return cgroupFD, "/" + path.Base(subCgroup)
|
||||
}
|
||||
|
||||
func TestUseCgroupFD(t *testing.T) {
|
||||
fd, suffix := prepareCgroupFD(t)
|
||||
|
||||
cmd := exec.Command(os.Args[0], "-test.run=TestUseCgroupFDHelper")
|
||||
cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
UseCgroupFD: true,
|
||||
CgroupFD: fd,
|
||||
}
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
t.Fatalf("Cmd failed with err %v, output: %s", err, out)
|
||||
}
|
||||
// NB: this wouldn't work with cgroupns.
|
||||
if !bytes.HasSuffix(bytes.TrimSpace(out), []byte(suffix)) {
|
||||
t.Fatalf("got: %q, want: a line that ends with %q", out, suffix)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUseCgroupFDHelper(*testing.T) {
|
||||
if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
|
||||
return
|
||||
}
|
||||
defer os.Exit(0)
|
||||
// Read and print own cgroup path.
|
||||
selfCg, err := os.ReadFile("/proc/self/cgroup")
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
os.Exit(2)
|
||||
}
|
||||
fmt.Print(string(selfCg))
|
||||
}
|
||||
|
||||
type capHeader struct {
|
||||
version uint32
|
||||
pid int32
|
||||
|
@ -94,6 +94,7 @@ func Syscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
|
||||
}
|
||||
|
||||
func rawSyscallNoError(trap, a1, a2, a3 uintptr) (r1, r2 uintptr)
|
||||
func rawVforkSyscall(trap, a1, a2 uintptr) (r1 uintptr, err Errno)
|
||||
|
||||
/*
|
||||
* Wrapped
|
||||
|
@ -8,6 +8,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS32
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -348,5 +349,3 @@ func (msghdr *Msghdr) SetControllen(length int) {
|
||||
func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint32(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -6,6 +6,7 @@ package syscall
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -120,5 +121,3 @@ func (msghdr *Msghdr) SetControllen(length int) {
|
||||
func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint64(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -8,6 +8,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS32
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -200,5 +201,3 @@ func (msghdr *Msghdr) SetControllen(length int) {
|
||||
func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint32(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -8,6 +8,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -182,5 +183,3 @@ func Pause() error {
|
||||
_, err := ppoll(nil, 0, nil, nil)
|
||||
return err
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -8,6 +8,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -217,5 +218,3 @@ func Pause() error {
|
||||
_, err := ppoll(nil, 0, nil, nil)
|
||||
return err
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -8,6 +8,7 @@ package syscall
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 5435
|
||||
_SYS_faccessat2 = 5439
|
||||
)
|
||||
|
||||
@ -182,5 +183,3 @@ func (msghdr *Msghdr) SetControllen(length int) {
|
||||
func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint64(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -10,6 +10,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 4435
|
||||
_SYS_faccessat2 = 4439
|
||||
)
|
||||
|
||||
@ -193,5 +194,3 @@ func (msghdr *Msghdr) SetControllen(length int) {
|
||||
func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint32(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -8,6 +8,7 @@ package syscall
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -91,8 +92,6 @@ func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint64(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
||||
//sys syncFileRange2(fd int, flags int, off int64, n int64) (err error) = SYS_SYNC_FILE_RANGE2
|
||||
|
||||
func SyncFileRange(fd int, off int64, n int64, flags int) error {
|
||||
|
@ -8,6 +8,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -168,5 +169,3 @@ func Pause() error {
|
||||
_, err := ppoll(nil, 0, nil, nil)
|
||||
return err
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
@ -8,6 +8,7 @@ import "unsafe"
|
||||
|
||||
const (
|
||||
_SYS_setgroups = SYS_SETGROUPS
|
||||
_SYS_clone3 = 435
|
||||
_SYS_faccessat2 = 439
|
||||
)
|
||||
|
||||
@ -257,5 +258,3 @@ func (msghdr *Msghdr) SetControllen(length int) {
|
||||
func (cmsg *Cmsghdr) SetLen(length int) {
|
||||
cmsg.Len = uint64(length)
|
||||
}
|
||||
|
||||
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
|
||||
|
Loading…
Reference in New Issue
Block a user