1
0
mirror of https://github.com/golang/go synced 2024-11-18 08:44:43 -07:00

math/big: faster (add|sub)V(V|W) routines

Benchmarks run on 3.06GHz Intel Core 2 Duo,
4GB 800MHz DDR2 SDRAM ("iMac").

benchmark             old ns/op    new ns/op    delta
BenchmarkAddVV_1              6            6   +2.75%
BenchmarkAddVV_2              9            7  -19.71%
BenchmarkAddVV_3              9            9   +2.25%
BenchmarkAddVV_4             10            8  -20.46%
BenchmarkAddVV_5             12           10  -19.53%
BenchmarkAddVV_1e1           23           15  -32.48%
BenchmarkAddVV_1e2          213          107  -49.77%
BenchmarkAddVV_1e3         2088          993  -52.44%
BenchmarkAddVV_1e4        20874        12027  -42.38%
BenchmarkAddVV_1e5       209858       121480  -42.11%
BenchmarkAddVW_1              5            5   +0.90%
BenchmarkAddVW_2             11           11   -3.51%
BenchmarkAddVW_3              7            7   -0.27%
BenchmarkAddVW_4              8            7   -6.32%
BenchmarkAddVW_5              9            8  -10.89%
BenchmarkAddVW_1e1           17           12  -26.01%
BenchmarkAddVW_1e2          155           89  -42.32%
BenchmarkAddVW_1e3         1479          873  -40.97%
BenchmarkAddVW_1e4        13838         8764  -36.67%
BenchmarkAddVW_1e5       147353        89560  -39.22%

benchmark              old MB/s     new MB/s  speedup
BenchmarkAddVV_1        9765.57      9508.55    0.97x
BenchmarkAddVV_2       13077.63     16284.97    1.25x
BenchmarkAddVV_3       20599.58     20156.67    0.98x
BenchmarkAddVV_4       23591.58     29516.02    1.25x
BenchmarkAddVV_5       24920.95     31194.10    1.25x
BenchmarkAddVV_1e1     27393.76     40621.71    1.48x
BenchmarkAddVV_1e2     29911.96     59592.99    1.99x
BenchmarkAddVV_1e3     30650.73     64429.84    2.10x
BenchmarkAddVV_1e4     30660.09     53213.08    1.74x
BenchmarkAddVV_1e5     30496.74     52683.46    1.73x
BenchmarkAddVW_1       11503.39     11405.98    0.99x
BenchmarkAddVW_2       11203.56     11586.92    1.03x
BenchmarkAddVW_3       26173.45     26224.75    1.00x
BenchmarkAddVW_4       30560.30     32621.94    1.07x
BenchmarkAddVW_5       33183.81     37269.94    1.12x
BenchmarkAddVW_1e1     36991.75     50098.53    1.35x
BenchmarkAddVW_1e2     41087.14     71549.93    1.74x
BenchmarkAddVW_1e3     43266.42     73279.83    1.69x
BenchmarkAddVW_1e4     46246.74     73021.97    1.58x
BenchmarkAddVW_1e5     43433.00     71459.96    1.65x

Benchmarks run on 2.8GHz Quad-Code Intel Xeon,
4GB 800MHz DDR2 FB-DIMM ("PowerMac").

benchmark             old ns/op    new ns/op    delta
BenchmarkAddVV_1              7            7   +2.51%
BenchmarkAddVV_2              8            8   +3.70%
BenchmarkAddVV_3             10           10   +4.00%
BenchmarkAddVV_4             11            9  -19.49%
BenchmarkAddVV_5             14           11  -18.44%
BenchmarkAddVV_1e1           23           17  -27.00%
BenchmarkAddVV_1e2          234          117  -50.00%
BenchmarkAddVV_1e3         2284         1095  -52.06%
BenchmarkAddVV_1e4        22906        13149  -42.60%
BenchmarkAddVV_1e5       229860       135133  -41.21%
BenchmarkAddVW_1              6            6   +1.15%
BenchmarkAddVW_2              7            7   +1.37%
BenchmarkAddVW_3              7            8   +1.00%
BenchmarkAddVW_4              9            8   -6.93%
BenchmarkAddVW_5             10            9  -13.21%
BenchmarkAddVW_1e1           18           14  -24.32%
BenchmarkAddVW_1e2          170           97  -42.41%
BenchmarkAddVW_1e3         1619          953  -41.14%
BenchmarkAddVW_1e4        15142         9776  -35.44%
BenchmarkAddVW_1e5       160835       102396  -36.33%

benchmark              old MB/s     new MB/s  speedup
BenchmarkAddVV_1        8928.95      8702.84    0.97x
BenchmarkAddVV_2       15298.84     14739.60    0.96x
BenchmarkAddVV_3       19116.52     18375.37    0.96x
BenchmarkAddVV_4       21644.30     26935.44    1.24x
BenchmarkAddVV_5       22771.64     27754.04    1.22x
BenchmarkAddVV_1e1     27017.62     37050.89    1.37x
BenchmarkAddVV_1e2     27326.09     54289.15    1.99x
BenchmarkAddVV_1e3     28016.84     58428.83    2.09x
BenchmarkAddVV_1e4     27939.38     48670.55    1.74x
BenchmarkAddVV_1e5     27843.00     47360.54    1.70x
BenchmarkAddVW_1       10510.97     10397.27    0.99x
BenchmarkAddVW_2       17499.71     17279.03    0.99x
BenchmarkAddVW_3       24093.93     23858.39    0.99x
BenchmarkAddVW_4       27733.08     29799.42    1.07x
BenchmarkAddVW_5       30267.17     34781.83    1.15x
BenchmarkAddVW_1e1     34566.78     45629.88    1.32x
BenchmarkAddVW_1e2     37521.89     65341.93    1.74x
BenchmarkAddVW_1e3     39513.18     67153.67    1.70x
BenchmarkAddVW_1e4     42263.80     65464.60    1.55x
BenchmarkAddVW_1e5     39792.21     62501.88    1.57x

R=iant, remyoudompheng, nightlyone, minux.ma
CC=golang-dev
https://golang.org/cl/6482062
This commit is contained in:
Robert Griesemer 2012-08-24 09:20:44 -07:00
parent c61a185f35
commit 35422bc11f

View File

@ -5,8 +5,6 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
// TODO(gri) - experiment with unrolled loops for faster execution
// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),7,$0
MOVQ x+0(FP), AX
@ -28,95 +26,224 @@ TEXT ·divWW(SB),7,$0
// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),7,$0
MOVQ z+0(FP), R10
MOVL n+8(FP), DI
MOVQ x+16(FP), R8
MOVQ y+32(FP), R9
MOVL n+8(FP), R11
MOVQ $0, BX // i = 0
MOVQ $0, DX // c = 0
JMP E1
MOVQ z+0(FP), R10
L1: MOVQ (R8)(BX*8), AX
RCRQ $1, DX
ADCQ (R9)(BX*8), AX
RCLQ $1, DX
MOVQ AX, (R10)(BX*8)
ADDL $1, BX // i++
MOVQ $0, CX // c = 0
MOVQ $0, SI // i = 0
E1: CMPQ BX, R11 // i < n
JL L1
// uncomment the next line to disable the unrolled loop
// JMP V1
CMPQ DI, $4
JL V1 // if n < 4 goto V1
MOVQ DX, c+48(FP)
U1: // n >= 4
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
RCRQ $1, CX // restore CF
ADCQ 0(R9)(SI*8), R11
ADCQ 8(R9)(SI*8), R12
ADCQ 16(R9)(SI*8), R13
ADCQ 24(R9)(SI*8), R14
RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
CMPQ DI, $4
JGE U1 // if n >= 4 goto U1
V1: CMPQ DI, $0
JLE E1 // if n <= 0 goto E1
L1: // n > 0
MOVQ 0(R8)(SI*8), R11
RCRQ $1, CX // restore CF
ADCQ 0(R9)(SI*8), R11
RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L1 // if n > 0 goto L1
E1: MOVQ CX, c+48(FP) // return c
RET
// func subVV(z, x, y []Word) (c Word)
// (same as addVV_s except for SBBQ instead of ADCQ and label names)
// (same as addVV except for SBBQ instead of ADCQ and label names)
TEXT ·subVV(SB),7,$0
MOVQ z+0(FP), R10
MOVL n+8(FP), DI
MOVQ x+16(FP), R8
MOVQ y+32(FP), R9
MOVL n+8(FP), R11
MOVQ $0, BX // i = 0
MOVQ $0, DX // c = 0
JMP E2
MOVQ z+0(FP), R10
L2: MOVQ (R8)(BX*8), AX
RCRQ $1, DX
SBBQ (R9)(BX*8), AX
RCLQ $1, DX
MOVQ AX, (R10)(BX*8)
ADDL $1, BX // i++
MOVQ $0, CX // c = 0
MOVQ $0, SI // i = 0
E2: CMPQ BX, R11 // i < n
JL L2
// uncomment the next line to disable the unrolled loop
// JMP V2
CMPQ DI, $4
JL V2 // if n < 4 goto V2
MOVQ DX, c+48(FP)
U2: // n >= 4
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
RCRQ $1, CX // restore CF
SBBQ 0(R9)(SI*8), R11
SBBQ 8(R9)(SI*8), R12
SBBQ 16(R9)(SI*8), R13
SBBQ 24(R9)(SI*8), R14
RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
CMPQ DI, $4
JGE U2 // if n >= 4 goto U2
V2: CMPQ DI, $0
JLE E2 // if n <= 0 goto E2
L2: // n > 0
MOVQ 0(R8)(SI*8), R11
RCRQ $1, CX // restore CF
SBBQ 0(R9)(SI*8), R11
RCLQ $1, CX // save CF
MOVQ R11, 0(R10)(SI*8)
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L2 // if n > 0 goto L2
E2: MOVQ CX, c+48(FP) // return c
RET
// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),7,$0
MOVQ z+0(FP), R10
MOVL n+8(FP), DI
MOVQ x+16(FP), R8
MOVQ y+32(FP), AX // c = y
MOVL n+8(FP), R11
MOVQ $0, BX // i = 0
JMP E3
MOVQ y+32(FP), CX // c = y
MOVQ z+0(FP), R10
MOVQ $0, SI // i = 0
L3: ADDQ (R8)(BX*8), AX
MOVQ AX, (R10)(BX*8)
RCLQ $1, AX
ANDQ $1, AX
ADDL $1, BX // i++
// uncomment the next line to disable the unrolled loop
// JMP V3
CMPQ DI, $4
JL V3 // if n < 4 goto V3
E3: CMPQ BX, R11 // i < n
JL L3
U3: // n >= 4
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
ADDQ CX, R11
ADCQ $0, R12
ADCQ $0, R13
ADCQ $0, R14
RCLQ $1, CX
ANDQ $1, CX
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
MOVQ AX, c+40(FP)
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
CMPQ DI, $4
JGE U3 // if n >= 4 goto U3
V3: CMPQ DI, $0
JLE E3 // if n <= 0 goto E3
L3: // n > 0
ADDQ 0(R8)(SI*8), CX
MOVQ CX, 0(R10)(SI*8)
RCLQ $1, CX
ANDQ $1, CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L3 // if n > 0 goto L3
E3: MOVQ CX, c+40(FP) // return c
RET
// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
TEXT ·subVW(SB),7,$0
MOVQ z+0(FP), R10
MOVL n+8(FP), DI
MOVQ x+16(FP), R8
MOVQ y+32(FP), AX // c = y
MOVL n+8(FP), R11
MOVQ $0, BX // i = 0
JMP E4
MOVQ y+32(FP), CX // c = y
MOVQ z+0(FP), R10
MOVQ $0, SI // i = 0
L4: MOVQ (R8)(BX*8), DX // TODO(gri) is there a reverse SUBQ?
SUBQ AX, DX
MOVQ DX, (R10)(BX*8)
RCLQ $1, AX
ANDQ $1, AX
ADDL $1, BX // i++
// uncomment the next line to disable the unrolled loop
// JMP V4
CMPQ DI, $4
JL V4 // if n < 4 goto V4
E4: CMPQ BX, R11 // i < n
JL L4
U4: // n >= 4
// regular loop body unrolled 4x
MOVQ 0(R8)(SI*8), R11
MOVQ 8(R8)(SI*8), R12
MOVQ 16(R8)(SI*8), R13
MOVQ 24(R8)(SI*8), R14
SUBQ CX, R11
SBBQ $0, R12
SBBQ $0, R13
SBBQ $0, R14
RCLQ $1, CX
ANDQ $1, CX
MOVQ R11, 0(R10)(SI*8)
MOVQ R12, 8(R10)(SI*8)
MOVQ R13, 16(R10)(SI*8)
MOVQ R14, 24(R10)(SI*8)
MOVQ AX, c+40(FP)
ADDQ $4, SI // i += 4
SUBQ $4, DI // n -= 4
CMPQ DI, $4
JGE U4 // if n >= 4 goto U4
V4: CMPQ DI, $0
JLE E4 // if n <= 0 goto E4
L4: // n > 0
MOVQ 0(R8)(SI*8), R11
SUBQ CX, R11
MOVQ R11, 0(R10)(SI*8)
RCLQ $1, CX
ANDQ $1, CX
ADDQ $1, SI // i++
SUBQ $1, DI // n--
JG L4 // if n > 0 goto L4
E4: MOVQ CX, c+40(FP) // return c
RET