1
0
mirror of https://github.com/golang/go synced 2024-11-17 13:14:56 -07:00

runtime: let the fault thread to crash the process

Let the fault thread to crash the program to make sure while gdb coredump file could see the correct backtrace in the number one thread in gdb.

Fixes #63277.

Change-Id: Ie4473f76f0feba596091433918bcd35a4ff7e11b
GitHub-Last-Rev: f4615c23f6
GitHub-Pull-Request: golang/go#63666
Reviewed-on: https://go-review.googlesource.com/c/go/+/536895
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
zzkcode 2023-11-30 11:39:29 +00:00 committed by Cherry Mui
parent 58bfef82fc
commit de5b418bea
2 changed files with 206 additions and 47 deletions

View File

@ -20,6 +20,43 @@ import (
"testing"
)
func canGenerateCore(t *testing.T) bool {
// Ensure there is enough RLIMIT_CORE available to generate a full core.
var lim syscall.Rlimit
err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim)
if err != nil {
t.Fatalf("error getting rlimit: %v", err)
}
// Minimum RLIMIT_CORE max to allow. This is a conservative estimate.
// Most systems allow infinity.
const minRlimitCore = 100 << 20 // 100 MB
if lim.Max < minRlimitCore {
t.Skipf("RLIMIT_CORE max too low: %#+v", lim)
}
// Make sure core pattern will send core to the current directory.
b, err := os.ReadFile("/proc/sys/kernel/core_pattern")
if err != nil {
t.Fatalf("error reading core_pattern: %v", err)
}
if string(b) != "core\n" {
t.Skipf("Unexpected core pattern %q", string(b))
}
coreUsesPID := false
b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid")
if err == nil {
switch string(bytes.TrimSpace(b)) {
case "0":
case "1":
coreUsesPID = true
default:
t.Skipf("unexpected core_uses_pid value %q", string(b))
}
}
return coreUsesPID
}
const coreSignalSource = `
package main
@ -81,45 +118,12 @@ func TestGdbCoreSignalBacktrace(t *testing.T) {
t.Parallel()
checkGdbVersion(t)
// Ensure there is enough RLIMIT_CORE available to generate a full core.
var lim syscall.Rlimit
err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim)
if err != nil {
t.Fatalf("error getting rlimit: %v", err)
}
// Minimum RLIMIT_CORE max to allow. This is a conservative estimate.
// Most systems allow infinity.
const minRlimitCore = 100 << 20 // 100 MB
if lim.Max < minRlimitCore {
t.Skipf("RLIMIT_CORE max too low: %#+v", lim)
}
// Make sure core pattern will send core to the current directory.
b, err := os.ReadFile("/proc/sys/kernel/core_pattern")
if err != nil {
t.Fatalf("error reading core_pattern: %v", err)
}
if string(b) != "core\n" {
t.Skipf("Unexpected core pattern %q", string(b))
}
coreUsesPID := false
b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid")
if err == nil {
switch string(bytes.TrimSpace(b)) {
case "0":
case "1":
coreUsesPID = true
default:
t.Skipf("unexpected core_uses_pid value %q", string(b))
}
}
dir := t.TempDir()
coreUsesPID := canGenerateCore(t)
// Build the source code.
dir := t.TempDir()
src := filepath.Join(dir, "main.go")
err = os.WriteFile(src, []byte(coreSignalSource), 0644)
err := os.WriteFile(src, []byte(coreSignalSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
@ -230,3 +234,143 @@ func TestGdbCoreSignalBacktrace(t *testing.T) {
t.Fatalf("could not find runtime symbol in backtrace after signal handler:\n%s", rest)
}
}
const coreCrashThreadSource = `
package main
/*
#cgo CFLAGS: -g -O0
#include <stdio.h>
#include <stddef.h>
void trigger_crash()
{
int* ptr = NULL;
*ptr = 1024;
}
*/
import "C"
import (
"flag"
"fmt"
"os"
"runtime/debug"
"syscall"
)
func enableCore() {
debug.SetTraceback("crash")
var lim syscall.Rlimit
err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim)
if err != nil {
panic(fmt.Sprintf("error getting rlimit: %v", err))
}
lim.Cur = lim.Max
fmt.Fprintf(os.Stderr, "Setting RLIMIT_CORE = %+#v\n", lim)
err = syscall.Setrlimit(syscall.RLIMIT_CORE, &lim)
if err != nil {
panic(fmt.Sprintf("error setting rlimit: %v", err))
}
}
func main() {
flag.Parse()
enableCore()
C.trigger_crash()
}
`
// TestGdbCoreCrashThreadBacktrace tests that runtime could let the fault thread to crash process
// and make fault thread as number one thread while gdb in a core file
func TestGdbCoreCrashThreadBacktrace(t *testing.T) {
if runtime.GOOS != "linux" {
// N.B. This test isn't fundamentally Linux-only, but it needs
// to know how to enable/find core files on each OS.
t.Skip("Test only supported on Linux")
}
if runtime.GOARCH != "386" && runtime.GOARCH != "amd64" {
// TODO(go.dev/issue/25218): Other architectures use sigreturn
// via VDSO, which we somehow don't handle correctly.
t.Skip("Backtrace through signal handler only works on 386 and amd64")
}
checkGdbEnvironment(t)
t.Parallel()
checkGdbVersion(t)
coreUsesPID := canGenerateCore(t)
// Build the source code.
dir := t.TempDir()
src := filepath.Join(dir, "main.go")
err := os.WriteFile(src, []byte(coreCrashThreadSource), 0644)
if err != nil {
t.Fatalf("failed to create file: %v", err)
}
cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go")
cmd.Dir = dir
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
if err != nil {
t.Fatalf("building source %v\n%s", err, out)
}
// Start the test binary.
cmd = testenv.Command(t, "./a.exe")
cmd.Dir = dir
var output bytes.Buffer
cmd.Stdout = &output // for test logging
cmd.Stderr = &output
if err := cmd.Start(); err != nil {
t.Fatalf("error starting test binary: %v", err)
}
pid := cmd.Process.Pid
err = cmd.Wait()
t.Logf("child output:\n%s", output.String())
if err == nil {
t.Fatalf("Wait succeeded, want SIGABRT")
}
ee, ok := err.(*exec.ExitError)
if !ok {
t.Fatalf("Wait err got %T %v, want exec.ExitError", ee, ee)
}
ws, ok := ee.Sys().(syscall.WaitStatus)
if !ok {
t.Fatalf("Sys got %T %v, want syscall.WaitStatus", ee.Sys(), ee.Sys())
}
if ws.Signal() != syscall.SIGABRT {
t.Fatalf("Signal got %d want SIGABRT", ws.Signal())
}
if !ws.CoreDump() {
t.Fatalf("CoreDump got %v want true", ws.CoreDump())
}
coreFile := "core"
if coreUsesPID {
coreFile += fmt.Sprintf(".%d", pid)
}
// Execute gdb commands.
args := []string{"-nx", "-batch",
"-iex", "add-auto-load-safe-path " + filepath.Join(testenv.GOROOT(t), "src", "runtime"),
"-ex", "backtrace",
filepath.Join(dir, "a.exe"),
filepath.Join(dir, coreFile),
}
cmd = testenv.Command(t, "gdb", args...)
got, err := cmd.CombinedOutput()
t.Logf("gdb output:\n%s", got)
if err != nil {
t.Fatalf("gdb exited with error: %v", err)
}
re := regexp.MustCompile(`#.* trigger_crash`)
if found := re.Find(got) != nil; !found {
t.Fatalf("could not find trigger_crash in backtrace")
}
}

View File

@ -597,7 +597,7 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool {
// crashing is the number of m's we have waited for when implementing
// GOTRACEBACK=crash when a signal is received.
var crashing int32
var crashing atomic.Int32
// testSigtrap and testSigusr1 are used by the runtime tests. If
// non-nil, it is called on SIGTRAP/SIGUSR1. If it returns true, the
@ -730,7 +730,7 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
mp.throwing = throwTypeRuntime
mp.caughtsig.set(gp)
if crashing == 0 {
if crashing.Load() == 0 {
startpanic_m()
}
@ -740,11 +740,11 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
if level > 0 {
goroutineheader(gp)
tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp)
if crashing > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning {
if crashing.Load() > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning {
// tracebackothers on original m skipped this one; trace it now.
goroutineheader(mp.curg)
traceback(^uintptr(0), ^uintptr(0), 0, mp.curg)
} else if crashing == 0 {
} else if crashing.Load() == 0 {
tracebackothers(gp)
print("\n")
}
@ -752,20 +752,35 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
}
if docrash {
crashing++
if crashing < mcount()-int32(extraMLength.Load()) {
isCrashThread := false
if crashing.CompareAndSwap(0, 1) {
isCrashThread = true
} else {
crashing.Add(1)
}
if crashing.Load() < mcount()-int32(extraMLength.Load()) {
// There are other m's that need to dump their stacks.
// Relay SIGQUIT to the next m by sending it to the current process.
// All m's that have already received SIGQUIT have signal masks blocking
// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
// When the last m receives the SIGQUIT, it will fall through to the call to
// crash below. Just in case the relaying gets botched, each m involved in
// The first m will wait until all ms received the SIGQUIT, then crash/exit.
// Just in case the relaying gets botched, each m involved in
// the relay sleeps for 5 seconds and then does the crash/exit itself.
// In expected operation, the last m has received the SIGQUIT and run
// crash/exit and the process is gone, all long before any of the
// 5-second sleeps have finished.
// The faulting m is crashing first so it is the faulting thread in the core dump (see issue #63277):
// in expected operation, the first m will wait until the last m has received the SIGQUIT,
// and then run crash/exit and the process is gone.
// However, if it spends more than 5 seconds to send SIGQUIT to all ms,
// any of ms may crash/exit the process after waiting for 5 seconds.
print("\n-----\n\n")
raiseproc(_SIGQUIT)
}
if isCrashThread {
i := 0
for (crashing.Load() < mcount()-int32(extraMLength.Load())) && i < 10 {
i++
usleep(500 * 1000)
}
} else {
usleep(5 * 1000 * 1000)
}
printDebugLog()