runtime: Optimize aeshash a bit. Use a better predicted branch

for checking for page boundary. Also avoid boundary check when >=16 bytes are hashed. benchmark old ns/op new ns/op delta BenchmarkHashStringSpeed 23 22 -0.43% BenchmarkHashBytesSpeed 44 42 -3.61% BenchmarkHashStringArraySpeed 71 68 -4.05% R=iant, khr CC=gobot, golang-dev, google https://golang.org/cl/9123046
2024-09-25 01:30:13 -06:00 · 2013-05-15 09:40:14 -07:00 · 2013-05-15 09:40:14 -07:00 · ee66972dce
commit ee66972dce
parent 23ad563119
3 changed files with 59 additions and 16 deletions
--- a/src/pkg/runtime/asm_386.s
+++ b/src/pkg/runtime/asm_386.s
@ -755,31 +755,39 @@ TEXT runtime·aeshashbody(SB),7,$0
 	PINSRD	$1, CX, X0	// size to next 32 bits of xmm0
 	MOVO	runtime·aeskeysched+0(SB), X2
 	MOVO	runtime·aeskeysched+16(SB), X3
+	CMPL	CX, $16
+	JB	aessmall
 aesloop:
 	CMPL	CX, $16
-	JB	aesloopend
+	JBE	aesloopend
 	MOVOU	(AX), X1
 	AESENC	X2, X0
 	AESENC	X1, X0
 	SUBL	$16, CX
 	ADDL	$16, AX
 	JMP	aesloop
+// 1-16 bytes remaining
 aesloopend:
+	// This load may overlap with the previous load above.
+	// We'll hash some bytes twice, but that's ok.
+	MOVOU	-16(AX)(CX*1), X1
+	JMP	partial
+// 0-15 bytes
+aessmall:
 	TESTL	CX, CX
-	JE	finalize	// no partial block
+	JE	finalize	// 0 bytes

-	TESTL	$16, AX
-	JNE	highpartial
+	CMPB	AX, $0xf0
+	JA	highpartial

-	// address ends in 0xxxx.  16 bytes loaded
-	// at this address won't cross a page boundary, so
-	// we can load it directly.
+	// 16 bytes loaded at this address won't cross
+	// a page boundary, so we can load it directly.
 	MOVOU	(AX), X1
 	ADDL	CX, CX
 	PAND	masks(SB)(CX*8), X1
 	JMP	partial
 highpartial:
-	// address ends in 1xxxx.  Might be up against
+	// address ends in 1111xxxx.  Might be up against
 	// a page boundary, so load ending at last byte.
 	// Then shift bytes down using pshufb.
 	MOVOU	-16(AX)(CX*1), X1
--- a/src/pkg/runtime/asm_amd64.s
+++ b/src/pkg/runtime/asm_amd64.s
@ -772,31 +772,39 @@ TEXT runtime·aeshashbody(SB),7,$0
 	PINSRQ	$1, CX, X0	// size to high 64 bits of xmm0
 	MOVO	runtime·aeskeysched+0(SB), X2
 	MOVO	runtime·aeskeysched+16(SB), X3
+	CMPQ	CX, $16
+	JB	aessmall
 aesloop:
 	CMPQ	CX, $16
-	JB	aesloopend
+	JBE	aesloopend
 	MOVOU	(AX), X1
 	AESENC	X2, X0
 	AESENC	X1, X0
 	SUBQ	$16, CX
 	ADDQ	$16, AX
 	JMP	aesloop
+// 1-16 bytes remaining
 aesloopend:
+	// This load may overlap with the previous load above.
+	// We'll hash some bytes twice, but that's ok.
+	MOVOU	-16(AX)(CX*1), X1
+	JMP	partial
+// 0-15 bytes
+aessmall:
 	TESTQ	CX, CX
-	JE	finalize	// no partial block
+	JE	finalize	// 0 bytes

-	TESTQ	$16, AX
-	JNE	highpartial
+	CMPB	AX, $0xf0
+	JA	highpartial

-	// address ends in 0xxxx.  16 bytes loaded
-	// at this address won't cross a page boundary, so
-	// we can load it directly.
+	// 16 bytes loaded at this address won't cross
+	// a page boundary, so we can load it directly.
 	MOVOU	(AX), X1
 	ADDQ	CX, CX
 	PAND	masks(SB)(CX*8), X1
 	JMP	partial
 highpartial:
-	// address ends in 1xxxx.  Might be up against
+	// address ends in 1111xxxx.  Might be up against
 	// a page boundary, so load ending at last byte.
 	// Then shift bytes down using pshufb.
 	MOVOU	-16(AX)(CX*1), X1
--- a/src/pkg/runtime/mapspeed_test.go
+++ b/src/pkg/runtime/mapspeed_test.go
@ -32,6 +32,33 @@ func BenchmarkHashStringSpeed(b *testing.B) {
 	}
 }

+type chunk [17]byte
+
+func BenchmarkHashBytesSpeed(b *testing.B) {
+	// a bunch of chunks, each with a different alignment mod 16
+	var chunks [size]chunk
+	// initialize each to a different value
+	for i := 0; i < size; i++ {
+		chunks[i][0] = byte(i)
+	}
+	// put into a map
+	m := make(map[chunk]int, size)
+	for i, c := range chunks {
+		m[c] = i
+	}
+	idx := 0
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if m[chunks[idx]] != idx {
+			b.Error("bad map entry for chunk")
+		}
+		idx++
+		if idx == size {
+			idx = 0
+		}
+	}
+}
+
 func BenchmarkHashInt32Speed(b *testing.B) {
 	ints := make([]int32, size)
 	for i := 0; i < size; i++ {