mirror of
https://github.com/golang/go
synced 2024-11-22 03:44:39 -07:00
math/big: Replace RCLQ + ANDQ with SETCS in unrolled arithmetic assembly.
benchmark old ns/op new ns/op delta
BenchmarkAddVW_1 8 8 +0.60%
BenchmarkAddVW_2 10 9 -8.64%
BenchmarkAddVW_3 10 10 -4.63%
BenchmarkAddVW_4 10 11 +3.67%
BenchmarkAddVW_5 11 12 +5.98%
BenchmarkAddVW_1e1 18 20 +6.38%
BenchmarkAddVW_1e2 129 115 -10.85%
BenchmarkAddVW_1e3 1270 1089 -14.25%
BenchmarkAddVW_1e4 13376 12145 -9.20%
BenchmarkAddVW_1e5 130392 125260 -3.94%

benchmark old MB/s new MB/s speedup
BenchmarkAddVW_1 7709.10 7661.92 0.99x
BenchmarkAddVW_2 12451.10 13604.00 1.09x
BenchmarkAddVW_3 17727.81 18721.54 1.06x
BenchmarkAddVW_4 23552.64 22708.81 0.96x
BenchmarkAddVW_5 27411.40 25816.22 0.94x
BenchmarkAddVW_1e1 34063.19 32023.06 0.94x
BenchmarkAddVW_1e2 49529.97 55360.55 1.12x
BenchmarkAddVW_1e3 50380.44 58764.18 1.17x
BenchmarkAddVW_1e4 47843.59 52696.10 1.10x
BenchmarkAddVW_1e5 49082.60 51093.66 1.04x

R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/6480063
This commit is contained in:
parent
f653dfeb49
commit
baf426f10f
@ -5,6 +5,16 @@
|
|||||||
// This file provides fast assembly versions for the elementary
|
// This file provides fast assembly versions for the elementary
|
||||||
// arithmetic operations on vectors implemented in arith.go.
|
// arithmetic operations on vectors implemented in arith.go.
|
||||||
|
|
||||||
|
// ZERO_CX is the literal encoding of MOVQ $0, CX (48 C7 C1 followed by a zero imm32); it zeroes CX and leaves CF untouched.
|
||||||
|
// (Raw bytes are used because MOVQ $0, reg is translated to XORQ reg, reg, which clears CF.)
|
||||||
|
#define ZERO_CX BYTE $0x48; \
|
||||||
|
BYTE $0xc7; \
|
||||||
|
BYTE $0xc1; \
|
||||||
|
BYTE $0x00; \
|
||||||
|
BYTE $0x00; \
|
||||||
|
BYTE $0x00; \
|
||||||
|
BYTE $0x00
|
||||||
|
|
||||||
// func mulWW(x, y Word) (z1, z0 Word)
|
// func mulWW(x, y Word) (z1, z0 Word)
|
||||||
TEXT ·mulWW(SB),7,$0
|
TEXT ·mulWW(SB),7,$0
|
||||||
MOVQ x+0(FP), AX
|
MOVQ x+0(FP), AX
|
||||||
@ -137,7 +147,7 @@ TEXT ·addVW(SB),7,$0
|
|||||||
MOVQ x+16(FP), R8
|
MOVQ x+16(FP), R8
|
||||||
MOVQ y+32(FP), CX // c = y
|
MOVQ y+32(FP), CX // c = y
|
||||||
MOVQ z+0(FP), R10
|
MOVQ z+0(FP), R10
|
||||||
|
|
||||||
MOVQ $0, SI // i = 0
|
MOVQ $0, SI // i = 0
|
||||||
|
|
||||||
// s/JL/JMP/ below to disable the unrolled loop
|
// s/JL/JMP/ below to disable the unrolled loop
|
||||||
@ -151,15 +161,15 @@ U3: // n >= 0
|
|||||||
MOVQ 16(R8)(SI*8), R13
|
MOVQ 16(R8)(SI*8), R13
|
||||||
MOVQ 24(R8)(SI*8), R14
|
MOVQ 24(R8)(SI*8), R14
|
||||||
ADDQ CX, R11
|
ADDQ CX, R11
|
||||||
|
ZERO_CX
|
||||||
ADCQ $0, R12
|
ADCQ $0, R12
|
||||||
ADCQ $0, R13
|
ADCQ $0, R13
|
||||||
ADCQ $0, R14
|
ADCQ $0, R14
|
||||||
|
SETCS CX // c = CF
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
MOVQ R12, 8(R10)(SI*8)
|
MOVQ R12, 8(R10)(SI*8)
|
||||||
MOVQ R13, 16(R10)(SI*8)
|
MOVQ R13, 16(R10)(SI*8)
|
||||||
MOVQ R14, 24(R10)(SI*8)
|
MOVQ R14, 24(R10)(SI*8)
|
||||||
RCLQ $1, CX // c = CF
|
|
||||||
ANDQ $1, CX
|
|
||||||
|
|
||||||
ADDQ $4, SI // i += 4
|
ADDQ $4, SI // i += 4
|
||||||
SUBQ $4, DI // n -= 4
|
SUBQ $4, DI // n -= 4
|
||||||
@ -171,8 +181,8 @@ V3: ADDQ $4, DI // n += 4
|
|||||||
L3: // n > 0
|
L3: // n > 0
|
||||||
ADDQ 0(R8)(SI*8), CX
|
ADDQ 0(R8)(SI*8), CX
|
||||||
MOVQ CX, 0(R10)(SI*8)
|
MOVQ CX, 0(R10)(SI*8)
|
||||||
|
ZERO_CX
|
||||||
RCLQ $1, CX // c = CF
|
RCLQ $1, CX // c = CF
|
||||||
ANDQ $1, CX
|
|
||||||
|
|
||||||
ADDQ $1, SI // i++
|
ADDQ $1, SI // i++
|
||||||
SUBQ $1, DI // n--
|
SUBQ $1, DI // n--
|
||||||
@ -203,15 +213,15 @@ U4: // n >= 0
|
|||||||
MOVQ 16(R8)(SI*8), R13
|
MOVQ 16(R8)(SI*8), R13
|
||||||
MOVQ 24(R8)(SI*8), R14
|
MOVQ 24(R8)(SI*8), R14
|
||||||
SUBQ CX, R11
|
SUBQ CX, R11
|
||||||
|
ZERO_CX
|
||||||
SBBQ $0, R12
|
SBBQ $0, R12
|
||||||
SBBQ $0, R13
|
SBBQ $0, R13
|
||||||
SBBQ $0, R14
|
SBBQ $0, R14
|
||||||
|
SETCS CX // c = CF
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
MOVQ R12, 8(R10)(SI*8)
|
MOVQ R12, 8(R10)(SI*8)
|
||||||
MOVQ R13, 16(R10)(SI*8)
|
MOVQ R13, 16(R10)(SI*8)
|
||||||
MOVQ R14, 24(R10)(SI*8)
|
MOVQ R14, 24(R10)(SI*8)
|
||||||
RCLQ $1, CX // c = CF
|
|
||||||
ANDQ $1, CX
|
|
||||||
|
|
||||||
ADDQ $4, SI // i += 4
|
ADDQ $4, SI // i += 4
|
||||||
SUBQ $4, DI // n -= 4
|
SUBQ $4, DI // n -= 4
|
||||||
@ -224,8 +234,8 @@ L4: // n > 0
|
|||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
SUBQ CX, R11
|
SUBQ CX, R11
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
|
ZERO_CX
|
||||||
RCLQ $1, CX // c = CF
|
RCLQ $1, CX // c = CF
|
||||||
ANDQ $1, CX
|
|
||||||
|
|
||||||
ADDQ $1, SI // i++
|
ADDQ $1, SI // i++
|
||||||
SUBQ $1, DI // n--
|
SUBQ $1, DI // n--
|
||||||
|
Loading…
Reference in New Issue
Block a user