mirror of
https://github.com/golang/go
synced 2024-11-19 08:54:47 -07:00
runtime: amd64, use 4-byte ops for memmove of 4 bytes
memmove used to use 2 2-byte load/store pairs to move 4 bytes. When the result is loaded with a single 4-byte load, it caused a store to load fowarding stall. To avoid the stall, special case memmove to use 4 byte ops for the 4 byte copy case. We already have a special case for 8-byte copies. 386 already specializes 4-byte copies. I'll do 2-byte copies also, but not for 1.8. benchmark old ns/op new ns/op delta BenchmarkIssue18740-8 7567 4799 -36.58% 3-byte copies get a bit slower. Other copies are unchanged. name old time/op new time/op delta Memmove/3-8 4.76ns ± 5% 5.26ns ± 3% +10.50% (p=0.000 n=10+10) Fixes #18740 Change-Id: Iec82cbac0ecfee80fa3c8fc83828f9a1819c3c74 Reviewed-on: https://go-review.googlesource.com/35567 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
parent
4cce27a3fa
commit
a96e117a58
@ -146,10 +146,16 @@ move_1or2:
|
||||
move_0:
|
||||
RET
|
||||
move_3or4:
|
||||
CMPQ BX, $4
|
||||
JB move_3
|
||||
MOVL (SI), AX
|
||||
MOVL AX, (DI)
|
||||
RET
|
||||
move_3:
|
||||
MOVW (SI), AX
|
||||
MOVW -2(SI)(BX*1), CX
|
||||
MOVB 2(SI), CX
|
||||
MOVW AX, (DI)
|
||||
MOVW CX, -2(DI)(BX*1)
|
||||
MOVB CX, 2(DI)
|
||||
RET
|
||||
move_5through7:
|
||||
MOVL (SI), AX
|
||||
|
@ -6,6 +6,7 @@ package runtime_test
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"internal/race"
|
||||
. "runtime"
|
||||
@ -447,3 +448,22 @@ func BenchmarkCopyFat1024(b *testing.B) {
|
||||
_ = y
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkIssue18740(b *testing.B) {
|
||||
// This tests that memmove uses one 4-byte load/store to move 4 bytes.
|
||||
// It used to do 2 2-byte load/stores, which leads to a pipeline stall
|
||||
// when we try to read the result with one 4-byte load.
|
||||
var buf [4]byte
|
||||
for j := 0; j < b.N; j++ {
|
||||
s := uint32(0)
|
||||
for i := 0; i < 4096; i += 4 {
|
||||
copy(buf[:], g[i:])
|
||||
s += binary.LittleEndian.Uint32(buf[:])
|
||||
}
|
||||
sink = uint64(s)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: 2 byte and 8 byte benchmarks also.
|
||||
|
||||
var g [4096]byte
|
||||
|
Loading…
Reference in New Issue
Block a user