1
0
mirror of https://github.com/golang/go synced 2024-09-25 01:30:13 -06:00

runtime: Optimize aeshash a bit. Use a better predicted branch

for checking for page boundary.  Also avoid boundary check
when >=16 bytes are hashed.

benchmark                        old ns/op    new ns/op    delta
BenchmarkHashStringSpeed                23           22   -0.43%
BenchmarkHashBytesSpeed                 44           42   -3.61%
BenchmarkHashStringArraySpeed           71           68   -4.05%

R=iant, khr
CC=gobot, golang-dev, google
https://golang.org/cl/9123046
This commit is contained in:
Keith Randall 2013-05-15 09:40:14 -07:00
parent 23ad563119
commit ee66972dce
3 changed files with 59 additions and 16 deletions

View File

@ -755,31 +755,39 @@ TEXT runtime·aeshashbody(SB),7,$0
PINSRD $1, CX, X0 // size to next 32 bits of xmm0
MOVO runtime·aeskeysched+0(SB), X2
MOVO runtime·aeskeysched+16(SB), X3
CMPL CX, $16
JB aessmall
aesloop:
CMPL CX, $16
JB aesloopend
JBE aesloopend
MOVOU (AX), X1
AESENC X2, X0
AESENC X1, X0
SUBL $16, CX
ADDL $16, AX
JMP aesloop
// 1-16 bytes remaining
aesloopend:
// This load may overlap with the previous load above.
// We'll hash some bytes twice, but that's ok.
MOVOU -16(AX)(CX*1), X1
JMP partial
// 0-15 bytes
aessmall:
TESTL CX, CX
JE finalize // no partial block
JE finalize // 0 bytes
TESTL $16, AX
JNE highpartial
CMPB AX, $0xf0
JA highpartial
// address ends in 0xxxx. 16 bytes loaded
// at this address won't cross a page boundary, so
// we can load it directly.
// 16 bytes loaded at this address won't cross
// a page boundary, so we can load it directly.
MOVOU (AX), X1
ADDL CX, CX
PAND masks(SB)(CX*8), X1
JMP partial
highpartial:
// address ends in 1xxxx. Might be up against
// address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1

View File

@ -772,31 +772,39 @@ TEXT runtime·aeshashbody(SB),7,$0
PINSRQ $1, CX, X0 // size to high 64 bits of xmm0
MOVO runtime·aeskeysched+0(SB), X2
MOVO runtime·aeskeysched+16(SB), X3
CMPQ CX, $16
JB aessmall
aesloop:
CMPQ CX, $16
JB aesloopend
JBE aesloopend
MOVOU (AX), X1
AESENC X2, X0
AESENC X1, X0
SUBQ $16, CX
ADDQ $16, AX
JMP aesloop
// 1-16 bytes remaining
aesloopend:
// This load may overlap with the previous load above.
// We'll hash some bytes twice, but that's ok.
MOVOU -16(AX)(CX*1), X1
JMP partial
// 0-15 bytes
aessmall:
TESTQ CX, CX
JE finalize // no partial block
JE finalize // 0 bytes
TESTQ $16, AX
JNE highpartial
CMPB AX, $0xf0
JA highpartial
// address ends in 0xxxx. 16 bytes loaded
// at this address won't cross a page boundary, so
// we can load it directly.
// 16 bytes loaded at this address won't cross
// a page boundary, so we can load it directly.
MOVOU (AX), X1
ADDQ CX, CX
PAND masks(SB)(CX*8), X1
JMP partial
highpartial:
// address ends in 1xxxx. Might be up against
// address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1

View File

@ -32,6 +32,33 @@ func BenchmarkHashStringSpeed(b *testing.B) {
}
}
type chunk [17]byte
func BenchmarkHashBytesSpeed(b *testing.B) {
// a bunch of chunks, each with a different alignment mod 16
var chunks [size]chunk
// initialize each to a different value
for i := 0; i < size; i++ {
chunks[i][0] = byte(i)
}
// put into a map
m := make(map[chunk]int, size)
for i, c := range chunks {
m[c] = i
}
idx := 0
b.ResetTimer()
for i := 0; i < b.N; i++ {
if m[chunks[idx]] != idx {
b.Error("bad map entry for chunk")
}
idx++
if idx == size {
idx = 0
}
}
}
func BenchmarkHashInt32Speed(b *testing.B) {
ints := make([]int32, size)
for i := 0; i < size; i++ {