math/big: Replace RCLQ + ANDQ with SETCS in unrolled arithmetic assembly.

benchmark                old ns/op    new ns/op    delta
BenchmarkAddVW_1                 8            8   +0.60%
BenchmarkAddVW_2                10            9   -8.64%
BenchmarkAddVW_3                10           10   -4.63%
BenchmarkAddVW_4                10           11   +3.67%
BenchmarkAddVW_5                11           12   +5.98%
BenchmarkAddVW_1e1              18           20   +6.38%
BenchmarkAddVW_1e2             129          115  -10.85%
BenchmarkAddVW_1e3            1270         1089  -14.25%
BenchmarkAddVW_1e4           13376        12145   -9.20%
BenchmarkAddVW_1e5          130392       125260   -3.94%

benchmark                 old MB/s     new MB/s  speedup
BenchmarkAddVW_1           7709.10      7661.92    0.99x
BenchmarkAddVW_2          12451.10     13604.00    1.09x
BenchmarkAddVW_3          17727.81     18721.54    1.06x
BenchmarkAddVW_4          23552.64     22708.81    0.96x
BenchmarkAddVW_5          27411.40     25816.22    0.94x
BenchmarkAddVW_1e1        34063.19     32023.06    0.94x
BenchmarkAddVW_1e2        49529.97     55360.55    1.12x
BenchmarkAddVW_1e3        50380.44     58764.18    1.17x
BenchmarkAddVW_1e4        47843.59     52696.10    1.10x
BenchmarkAddVW_1e5        49082.60     51093.66    1.04x

R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/6480063
2024-11-21 21:34:40 -07:00 · 2012-08-28 09:29:45 -07:00 · 2012-08-28 09:29:45 -07:00 · baf426f10f
commit baf426f10f
parent f653dfeb49
1 changed files with 17 additions and 7 deletions
--- a/src/pkg/math/big/arith_amd64.s
+++ b/src/pkg/math/big/arith_amd64.s
@@ -5,6 +5,16 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.

+// ZERO_CX emits the literal machine encoding of MOVQ $0, CX (REX.W, opcode 0xc7, ModRM 0xc1, 32-bit zero immediate): it sets CX to 0 and leaves the flags untouched.
+// The bytes are spelled out because MOVQ $0, reg is translated to XORQ reg, reg, which clears CF; a carry that a following ADCQ/SBBQ/SETCS/RCLQ still needs would be lost.
+#define ZERO_CX BYTE $0x48; \
+		BYTE $0xc7; \
+		BYTE $0xc1; \
+		BYTE $0x00; \
+		BYTE $0x00; \
+		BYTE $0x00; \
+		BYTE $0x00
+
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),7,$0
 	MOVQ x+0(FP), AX
@@ -137,7 +147,7 @@ TEXT ·addVW(SB),7,$0
 	MOVQ x+16(FP), R8
 	MOVQ y+32(FP), CX	// c = y
 	MOVQ z+0(FP), R10
-	
+
 	MOVQ $0, SI		// i = 0

 	// s/JL/JMP/ below to disable the unrolled loop
@@ -151,15 +161,15 @@ U3:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
+	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
+	SETCS CX		// c = CF
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX

 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
@@ -171,8 +181,8 @@ V3:	ADDQ $4, DI		// n += 4
 L3:	// n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
+	ZERO_CX
 	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX

 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
@@ -203,15 +213,15 @@ U4:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
+	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
+	SETCS CX		// c = CF
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX

 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
@@ -224,8 +234,8 @@ L4:	// n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
+	ZERO_CX
 	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX

 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--