mirror of
https://github.com/golang/go
synced 2024-11-21 21:34:40 -07:00
math/big: Replace RCLQ + ANDQ with SETCS in unrolled arithmetic assembly.
benchmark              old ns/op    new ns/op    delta
BenchmarkAddVW_1               8            8   +0.60%
BenchmarkAddVW_2              10            9   -8.64%
BenchmarkAddVW_3              10           10   -4.63%
BenchmarkAddVW_4              10           11   +3.67%
BenchmarkAddVW_5              11           12   +5.98%
BenchmarkAddVW_1e1            18           20   +6.38%
BenchmarkAddVW_1e2           129          115  -10.85%
BenchmarkAddVW_1e3          1270         1089  -14.25%
BenchmarkAddVW_1e4         13376        12145   -9.20%
BenchmarkAddVW_1e5        130392       125260   -3.94%

benchmark               old MB/s     new MB/s  speedup
BenchmarkAddVW_1         7709.10      7661.92    0.99x
BenchmarkAddVW_2        12451.10     13604.00    1.09x
BenchmarkAddVW_3        17727.81     18721.54    1.06x
BenchmarkAddVW_4        23552.64     22708.81    0.96x
BenchmarkAddVW_5        27411.40     25816.22    0.94x
BenchmarkAddVW_1e1      34063.19     32023.06    0.94x
BenchmarkAddVW_1e2      49529.97     55360.55    1.12x
BenchmarkAddVW_1e3      50380.44     58764.18    1.17x
BenchmarkAddVW_1e4      47843.59     52696.10    1.10x
BenchmarkAddVW_1e5      49082.60     51093.66    1.04x

R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/6480063
This commit is contained in:
parent
f653dfeb49
commit
baf426f10f
@ -5,6 +5,16 @@
|
||||
// This file provides fast assembly versions for the elementary
|
||||
// arithmetic operations on vectors implemented in arith.go.
|
||||
|
||||
// ZERO_CX is the literal byte encoding of the instruction MOVQ $0, CX:
// 0x48 0xc7 0xc1 followed by a four-byte zero immediate.
|
||||
// It is written out as raw bytes because the assembler translates
// MOVQ $0, reg to XORQ reg, reg, which clears CF. The literal MOV form
// zeroes CX while leaving CF intact, so it can sit inside a carry chain.
|
||||
#define ZERO_CX BYTE $0x48; \
|
||||
BYTE $0xc7; \
|
||||
BYTE $0xc1; \
|
||||
BYTE $0x00; \
|
||||
BYTE $0x00; \
|
||||
BYTE $0x00; \
|
||||
BYTE $0x00
|
||||
|
||||
// func mulWW(x, y Word) (z1, z0 Word)
|
||||
TEXT ·mulWW(SB),7,$0
|
||||
MOVQ x+0(FP), AX
|
||||
@ -137,7 +147,7 @@ TEXT ·addVW(SB),7,$0
|
||||
MOVQ x+16(FP), R8
|
||||
MOVQ y+32(FP), CX // c = y
|
||||
MOVQ z+0(FP), R10
|
||||
|
||||
|
||||
MOVQ $0, SI // i = 0
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
@ -151,15 +161,15 @@ U3: // n >= 0
|
||||
MOVQ 16(R8)(SI*8), R13
|
||||
MOVQ 24(R8)(SI*8), R14
|
||||
ADDQ CX, R11
|
||||
ZERO_CX
|
||||
ADCQ $0, R12
|
||||
ADCQ $0, R13
|
||||
ADCQ $0, R14
|
||||
SETCS CX // c = CF
|
||||
MOVQ R11, 0(R10)(SI*8)
|
||||
MOVQ R12, 8(R10)(SI*8)
|
||||
MOVQ R13, 16(R10)(SI*8)
|
||||
MOVQ R14, 24(R10)(SI*8)
|
||||
RCLQ $1, CX // c = CF
|
||||
ANDQ $1, CX
|
||||
|
||||
ADDQ $4, SI // i += 4
|
||||
SUBQ $4, DI // n -= 4
|
||||
@ -171,8 +181,8 @@ V3: ADDQ $4, DI // n += 4
|
||||
L3: // n > 0
|
||||
ADDQ 0(R8)(SI*8), CX
|
||||
MOVQ CX, 0(R10)(SI*8)
|
||||
ZERO_CX
|
||||
RCLQ $1, CX // c = CF
|
||||
ANDQ $1, CX
|
||||
|
||||
ADDQ $1, SI // i++
|
||||
SUBQ $1, DI // n--
|
||||
@ -203,15 +213,15 @@ U4: // n >= 0
|
||||
MOVQ 16(R8)(SI*8), R13
|
||||
MOVQ 24(R8)(SI*8), R14
|
||||
SUBQ CX, R11
|
||||
ZERO_CX
|
||||
SBBQ $0, R12
|
||||
SBBQ $0, R13
|
||||
SBBQ $0, R14
|
||||
SETCS CX // c = CF
|
||||
MOVQ R11, 0(R10)(SI*8)
|
||||
MOVQ R12, 8(R10)(SI*8)
|
||||
MOVQ R13, 16(R10)(SI*8)
|
||||
MOVQ R14, 24(R10)(SI*8)
|
||||
RCLQ $1, CX // c = CF
|
||||
ANDQ $1, CX
|
||||
|
||||
ADDQ $4, SI // i += 4
|
||||
SUBQ $4, DI // n -= 4
|
||||
@ -224,8 +234,8 @@ L4: // n > 0
|
||||
MOVQ 0(R8)(SI*8), R11
|
||||
SUBQ CX, R11
|
||||
MOVQ R11, 0(R10)(SI*8)
|
||||
ZERO_CX
|
||||
RCLQ $1, CX // c = CF
|
||||
ANDQ $1, CX
|
||||
|
||||
ADDQ $1, SI // i++
|
||||
SUBQ $1, DI // n--
|
||||
|
Loading…
Reference in New Issue
Block a user