mirror of
https://github.com/golang/go
synced 2024-11-17 12:44:49 -07:00
cmd/go/internal/lockedfile: use a retry loop to suppress EDEADLK on AIX and Solaris
AIX, Solaris, and Illumos all appear to implement fcntl deadlock detection at the granularity of processes. However, we are acquiring and releasing file locks on individual goroutines running concurrently: our locking occurs at a much finer granularity. As a result, these platforms occasionally fail with EDEADLK errors, when they detect locks that would be _misordered_ in a single-threaded program but are safely _unordered_ in a multi-threaded context. To work around the spurious errors, we treat EDEADLK as always spurious, and retry the failing system call with a bounded exponential backoff. This approach may introduce substantial latency since we no longer benefit from kernel-scheduled wakeups in case of collisions, but high-latency operations seem better than spurious failures. Updates #33974 Updates #35618 Fixes #32817 Change-Id: I58b2c6a0f143bce55d6460fd4ddc3db83577ada7 Reviewed-on: https://go-review.googlesource.com/c/go/+/222277 Reviewed-by: Jay Conrod <jayconrod@google.com>
This commit is contained in:
parent
2b8e60d464
commit
c6f678b6ef
@ -13,19 +13,19 @@
|
||||
// or an F_OFD_SETLK command for 'fcntl', that allows for better concurrency and
|
||||
// does not require per-inode bookkeeping in the application.
|
||||
//
|
||||
// TODO(bcmills): If we add a build tag for Illumos (see golang.org/issue/20603)
|
||||
// then Illumos should use F_OFD_SETLK, and the resulting code would be as
|
||||
// simple as filelock_unix.go. We will still need the code in this file for AIX
|
||||
// or as long as Oracle Solaris provides only F_SETLK.
|
||||
// TODO(golang.org/issue/35618): add a syscall.Flock binding for Illumos and
|
||||
// switch it over to use filelock_unix.go.
|
||||
|
||||
package filelock
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"math/rand"
|
||||
"os"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
type lockType int16
|
||||
@ -91,7 +91,67 @@ func lock(f File, lt lockType) (err error) {
|
||||
wait <- f
|
||||
}
|
||||
|
||||
err = setlkw(f.Fd(), lt)
|
||||
// Spurious EDEADLK errors arise on platforms that compute deadlock graphs at
|
||||
// the process, rather than thread, level. Consider processes P and Q, with
|
||||
// threads P.1, P.2, and Q.3. The following trace is NOT a deadlock, but will be
|
||||
// reported as a deadlock on systems that consider only process granularity:
|
||||
//
|
||||
// P.1 locks file A.
|
||||
// Q.3 locks file B.
|
||||
// Q.3 blocks on file A.
|
||||
// P.2 blocks on file B. (This is erroneously reported as a deadlock.)
|
||||
// P.1 unlocks file A.
|
||||
// Q.3 unblocks and locks file A.
|
||||
// Q.3 unlocks files A and B.
|
||||
// P.2 unblocks and locks file B.
|
||||
// P.2 unlocks file B.
|
||||
//
|
||||
// These spurious errors were observed in practice on AIX and Solaris in
|
||||
// cmd/go: see https://golang.org/issue/32817.
|
||||
//
|
||||
// We work around this bug by treating EDEADLK as always spurious. If there
|
||||
// really is a lock-ordering bug between the interacting processes, it will
|
||||
// become a livelock instead, but that's not appreciably worse than if we had
|
||||
// a proper flock implementation (which generally does not even attempt to
|
||||
// diagnose deadlocks).
|
||||
//
|
||||
// In the above example, that changes the trace to:
|
||||
//
|
||||
// P.1 locks file A.
|
||||
// Q.3 locks file B.
|
||||
// Q.3 blocks on file A.
|
||||
// P.2 spuriously fails to lock file B and goes to sleep.
|
||||
// P.1 unlocks file A.
|
||||
// Q.3 unblocks and locks file A.
|
||||
// Q.3 unlocks files A and B.
|
||||
// P.2 wakes up and locks file B.
|
||||
// P.2 unlocks file B.
|
||||
//
|
||||
// We know that the retry loop will not introduce a *spurious* livelock
|
||||
// because, according to the POSIX specification, EDEADLK is only to be
|
||||
// returned when “the lock is blocked by a lock from another process”.
|
||||
// If that process is blocked on some lock that we are holding, then the
|
||||
// resulting livelock is due to a real deadlock (and would manifest as such
|
||||
// when using, for example, the flock implementation of this package).
|
||||
// If the other process is *not* blocked on some other lock that we are
|
||||
// holding, then it will eventually release the requested lock.
|
||||
|
||||
nextSleep := 1 * time.Millisecond
|
||||
const maxSleep = 500 * time.Millisecond
|
||||
for {
|
||||
err = setlkw(f.Fd(), lt)
|
||||
if err != syscall.EDEADLK {
|
||||
break
|
||||
}
|
||||
time.Sleep(nextSleep)
|
||||
|
||||
nextSleep += nextSleep
|
||||
if nextSleep > maxSleep {
|
||||
nextSleep = maxSleep
|
||||
}
|
||||
// Apply 10% jitter to avoid synchronizing collisions when we finally unblock.
|
||||
nextSleep += time.Duration((0.1*rand.Float64() - 0.05) * float64(nextSleep))
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
unlock(f)
|
||||
|
@ -8,8 +8,11 @@
|
||||
package lockedfile_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"internal/testenv"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
@ -172,3 +175,98 @@ func TestCanLockExistingFile(t *testing.T) {
|
||||
f.Close()
|
||||
wait(t)
|
||||
}
|
||||
|
||||
// TestSpuriousEDEADLK verifies that the spurious EDEADLK reported in
|
||||
// https://golang.org/issue/32817 no longer occurs.
|
||||
func TestSpuriousEDEADLK(t *testing.T) {
|
||||
// P.1 locks file A.
|
||||
// Q.3 locks file B.
|
||||
// Q.3 blocks on file A.
|
||||
// P.2 blocks on file B. (Spurious EDEADLK occurs here.)
|
||||
// P.1 unlocks file A.
|
||||
// Q.3 unblocks and locks file A.
|
||||
// Q.3 unlocks files A and B.
|
||||
// P.2 unblocks and locks file B.
|
||||
// P.2 unlocks file B.
|
||||
|
||||
testenv.MustHaveExec(t)
|
||||
|
||||
dirVar := t.Name() + "DIR"
|
||||
|
||||
if dir := os.Getenv(dirVar); dir != "" {
|
||||
// Q.3 locks file B.
|
||||
b, err := lockedfile.Edit(filepath.Join(dir, "B"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
if err := ioutil.WriteFile(filepath.Join(dir, "locked"), []byte("ok"), 0666); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Q.3 blocks on file A.
|
||||
a, err := lockedfile.Edit(filepath.Join(dir, "A"))
|
||||
// Q.3 unblocks and locks file A.
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer a.Close()
|
||||
|
||||
// Q.3 unlocks files A and B.
|
||||
return
|
||||
}
|
||||
|
||||
dir, remove := mustTempDir(t)
|
||||
defer remove()
|
||||
|
||||
// P.1 locks file A.
|
||||
a, err := lockedfile.Edit(filepath.Join(dir, "A"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cmd := exec.Command(os.Args[0], "-test.run="+t.Name())
|
||||
cmd.Env = append(os.Environ(), fmt.Sprintf("%s=%s", dirVar, dir))
|
||||
|
||||
qDone := make(chan struct{})
|
||||
waitQ := mustBlock(t, "Edit A and B in subprocess", func() {
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
t.Errorf("%v:\n%s", err, out)
|
||||
}
|
||||
close(qDone)
|
||||
})
|
||||
|
||||
// Wait until process Q has either failed or locked file B.
|
||||
// Otherwise, P.2 might not block on file B as intended.
|
||||
locked:
|
||||
for {
|
||||
if _, err := os.Stat(filepath.Join(dir, "locked")); !os.IsNotExist(err) {
|
||||
break locked
|
||||
}
|
||||
select {
|
||||
case <-qDone:
|
||||
break locked
|
||||
case <-time.After(1 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
|
||||
waitP2 := mustBlock(t, "Edit B", func() {
|
||||
// P.2 blocks on file B. (Spurious EDEADLK occurs here.)
|
||||
b, err := lockedfile.Edit(filepath.Join(dir, "B"))
|
||||
// P.2 unblocks and locks file B.
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
return
|
||||
}
|
||||
// P.2 unlocks file B.
|
||||
b.Close()
|
||||
})
|
||||
|
||||
// P.1 unlocks file A.
|
||||
a.Close()
|
||||
|
||||
waitQ(t)
|
||||
waitP2(t)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user