1
0
mirror of https://github.com/golang/go synced 2024-11-25 07:17:56 -07:00

math/big: Replace RCLQ + ANDQ with SETCS in unrolled arithmetic assembly.

benchmark             old ns/op    new ns/op    delta
BenchmarkAddVW_1              8            8   +0.60%
BenchmarkAddVW_2             10            9   -8.64%
BenchmarkAddVW_3             10           10   -4.63%
BenchmarkAddVW_4             10           11   +3.67%
BenchmarkAddVW_5             11           12   +5.98%
BenchmarkAddVW_1e1           18           20   +6.38%
BenchmarkAddVW_1e2          129          115  -10.85%
BenchmarkAddVW_1e3         1270         1089  -14.25%
BenchmarkAddVW_1e4        13376        12145   -9.20%
BenchmarkAddVW_1e5       130392       125260   -3.94%

benchmark              old MB/s     new MB/s  speedup
BenchmarkAddVW_1        7709.10      7661.92    0.99x
BenchmarkAddVW_2       12451.10     13604.00    1.09x
BenchmarkAddVW_3       17727.81     18721.54    1.06x
BenchmarkAddVW_4       23552.64     22708.81    0.96x
BenchmarkAddVW_5       27411.40     25816.22    0.94x
BenchmarkAddVW_1e1     34063.19     32023.06    0.94x
BenchmarkAddVW_1e2     49529.97     55360.55    1.12x
BenchmarkAddVW_1e3     50380.44     58764.18    1.17x
BenchmarkAddVW_1e4     47843.59     52696.10    1.10x
BenchmarkAddVW_1e5     49082.60     51093.66    1.04x

R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/6480063
This commit is contained in:
Christopher Swenson 2012-08-28 09:29:45 -07:00 committed by Robert Griesemer
parent f653dfeb49
commit baf426f10f

View File

@ -5,6 +5,16 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// Literal instruction for MOVQ $0, CX.
// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
#define ZERO_CX BYTE $0x48; \
BYTE $0xc7; \
BYTE $0xc1; \
BYTE $0x00; \
BYTE $0x00; \
BYTE $0x00; \
BYTE $0x00
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),7,$0
MOVQ x+0(FP), AX
@ -137,7 +147,7 @@ TEXT ·addVW(SB),7,$0
MOVQ x+16(FP), R8
MOVQ y+32(FP), CX // c = y
MOVQ z+0(FP), R10
MOVQ $0, SI // i = 0
// s/JL/JMP/ below to disable the unrolled loop
@ -151,15 +161,15 @@ U3: // n >= 0
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
ADDQ CX, R11
ZERO_CX
ADCQ $0, R12
ADCQ $0, R13
ADCQ $0, R14
SETCS CX // c = CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
RCLQ $1, CX // c = CF
ANDQ $1, CX
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
@ -171,8 +181,8 @@ V3: ADDQ $4, DI // n += 4
L3: // n > 0
ADDQ 0(R8)(SI*8), CX
MOVQ CX, 0(R10)(SI*8)
ZERO_CX
RCLQ $1, CX // c = CF
ANDQ $1, CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--
@ -203,15 +213,15 @@ U4: // n >= 0
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
SUBQ CX, R11
ZERO_CX
SBBQ $0, R12
SBBQ $0, R13
SBBQ $0, R14
SETCS CX // c = CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
RCLQ $1, CX // c = CF
ANDQ $1, CX
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
@ -224,8 +234,8 @@ L4: // n > 0
MOVQ 0(R8)(SI*8), R11
SUBQ CX, R11
MOVQ R11, 0(R10)(SI*8)
ZERO_CX
RCLQ $1, CX // c = CF
ANDQ $1, CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--