mirror of
https://github.com/golang/go
synced 2024-11-25 09:57:57 -07:00
math/big: minor tweaks to assembly code (slightly better performance)
Benchmarks run on 2.8GHz Quad-Code Intel Xeon, 4GB 800MHz DDR2 FB-DIMM ("PowerMac"). benchmark old ns/op new ns/op delta BenchmarkAddVV_1 7 7 -0.82% BenchmarkAddVV_2 8 8 -3.46% BenchmarkAddVV_3 10 9 -4.81% BenchmarkAddVV_4 9 9 -1.89% BenchmarkAddVV_5 11 10 -5.22% BenchmarkAddVV_1e1 17 18 +4.05% BenchmarkAddVV_1e2 117 115 -1.71% BenchmarkAddVV_1e3 1095 1090 -0.46% BenchmarkAddVV_1e4 13149 12679 -3.57% BenchmarkAddVV_1e5 135133 129482 -4.18% BenchmarkAddVW_1 6 6 -1.14% BenchmarkAddVW_2 7 7 +3.78% BenchmarkAddVW_3 8 8 +0.12% BenchmarkAddVW_4 8 8 -6.52% BenchmarkAddVW_5 9 8 -3.70% BenchmarkAddVW_1e1 14 13 -4.29% BenchmarkAddVW_1e2 97 96 -1.33% BenchmarkAddVW_1e3 953 940 -1.36% BenchmarkAddVW_1e4 9776 9527 -2.55% BenchmarkAddVW_1e5 102396 97738 -4.55% benchmark old MB/s new MB/s speedup BenchmarkAddVV_1 8702.84 8774.56 1.01x BenchmarkAddVV_2 14739.60 15277.82 1.04x BenchmarkAddVV_3 18375.37 19398.16 1.06x BenchmarkAddVV_4 26935.44 27464.68 1.02x BenchmarkAddVV_5 27754.04 29423.30 1.06x BenchmarkAddVV_1e1 37050.89 35629.72 0.96x BenchmarkAddVV_1e2 54289.15 55533.24 1.02x BenchmarkAddVV_1e3 58428.83 58682.53 1.00x BenchmarkAddVV_1e4 48670.55 50475.99 1.04x BenchmarkAddVV_1e5 47360.54 49427.66 1.04x BenchmarkAddVW_1 10397.27 10502.23 1.01x BenchmarkAddVW_2 17279.03 16654.13 0.96x BenchmarkAddVW_3 23858.39 23825.89 1.00x BenchmarkAddVW_4 29799.42 31895.06 1.07x BenchmarkAddVW_5 34781.83 36105.11 1.04x BenchmarkAddVW_1e1 45629.88 47597.42 1.04x BenchmarkAddVW_1e2 65341.93 66240.04 1.01x BenchmarkAddVW_1e3 67153.67 68069.83 1.01x BenchmarkAddVW_1e4 65464.60 67173.83 1.03x BenchmarkAddVW_1e5 62501.88 65480.66 1.05x R=iant CC=golang-dev https://golang.org/cl/6484056
This commit is contained in:
parent
35422bc11f
commit
3bd8684fac
@ -32,45 +32,44 @@ TEXT ·addVV(SB),7,$0
|
|||||||
MOVQ z+0(FP), R10
|
MOVQ z+0(FP), R10
|
||||||
|
|
||||||
MOVQ $0, CX // c = 0
|
MOVQ $0, CX // c = 0
|
||||||
MOVQ $0, SI // i = 0
|
MOVQ $0, SI // i = 0
|
||||||
|
|
||||||
// uncomment the next line to disable the unrolled loop
|
// uncomment the next line to disable the unrolled loop
|
||||||
// JMP V1
|
// JMP V1
|
||||||
|
|
||||||
CMPQ DI, $4
|
SUBQ $4, DI // n -= 4
|
||||||
JL V1 // if n < 4 goto V1
|
JL V1 // if n < 0 goto V1
|
||||||
|
|
||||||
U1: // n >= 4
|
U1: // n >= 0
|
||||||
// regular loop body unrolled 4x
|
// regular loop body unrolled 4x
|
||||||
|
RCRQ $1, CX // CF = c
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
MOVQ 8(R8)(SI*8), R12
|
MOVQ 8(R8)(SI*8), R12
|
||||||
MOVQ 16(R8)(SI*8), R13
|
MOVQ 16(R8)(SI*8), R13
|
||||||
MOVQ 24(R8)(SI*8), R14
|
MOVQ 24(R8)(SI*8), R14
|
||||||
RCRQ $1, CX // restore CF
|
|
||||||
ADCQ 0(R9)(SI*8), R11
|
ADCQ 0(R9)(SI*8), R11
|
||||||
ADCQ 8(R9)(SI*8), R12
|
ADCQ 8(R9)(SI*8), R12
|
||||||
ADCQ 16(R9)(SI*8), R13
|
ADCQ 16(R9)(SI*8), R13
|
||||||
ADCQ 24(R9)(SI*8), R14
|
ADCQ 24(R9)(SI*8), R14
|
||||||
RCLQ $1, CX // save CF
|
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
MOVQ R12, 8(R10)(SI*8)
|
MOVQ R12, 8(R10)(SI*8)
|
||||||
MOVQ R13, 16(R10)(SI*8)
|
MOVQ R13, 16(R10)(SI*8)
|
||||||
MOVQ R14, 24(R10)(SI*8)
|
MOVQ R14, 24(R10)(SI*8)
|
||||||
|
RCLQ $1, CX // c = CF
|
||||||
|
|
||||||
ADDQ $4, SI // i += 4
|
ADDQ $4, SI // i += 4
|
||||||
SUBQ $4, DI // n -= 4
|
SUBQ $4, DI // n -= 4
|
||||||
CMPQ DI, $4
|
JGE U1 // if n >= 0 goto U1
|
||||||
JGE U1 // if n >= 4 goto U1
|
|
||||||
|
|
||||||
V1: CMPQ DI, $0
|
V1: ADDQ $4, DI // n += 4
|
||||||
JLE E1 // if n <= 0 goto E1
|
JLE E1 // if n <= 0 goto E1
|
||||||
|
|
||||||
L1: // n > 0
|
L1: // n > 0
|
||||||
|
RCRQ $1, CX // CF = c
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
RCRQ $1, CX // restore CF
|
|
||||||
ADCQ 0(R9)(SI*8), R11
|
ADCQ 0(R9)(SI*8), R11
|
||||||
RCLQ $1, CX // save CF
|
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
|
RCLQ $1, CX // c = CF
|
||||||
|
|
||||||
ADDQ $1, SI // i++
|
ADDQ $1, SI // i++
|
||||||
SUBQ $1, DI // n--
|
SUBQ $1, DI // n--
|
||||||
@ -94,40 +93,39 @@ TEXT ·subVV(SB),7,$0
|
|||||||
// uncomment the next line to disable the unrolled loop
|
// uncomment the next line to disable the unrolled loop
|
||||||
// JMP V2
|
// JMP V2
|
||||||
|
|
||||||
CMPQ DI, $4
|
SUBQ $4, DI // n -= 4
|
||||||
JL V2 // if n < 4 goto V2
|
JL V2 // if n < 0 goto V2
|
||||||
|
|
||||||
U2: // n >= 4
|
U2: // n >= 0
|
||||||
// regular loop body unrolled 4x
|
// regular loop body unrolled 4x
|
||||||
|
RCRQ $1, CX // CF = c
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
MOVQ 8(R8)(SI*8), R12
|
MOVQ 8(R8)(SI*8), R12
|
||||||
MOVQ 16(R8)(SI*8), R13
|
MOVQ 16(R8)(SI*8), R13
|
||||||
MOVQ 24(R8)(SI*8), R14
|
MOVQ 24(R8)(SI*8), R14
|
||||||
RCRQ $1, CX // restore CF
|
|
||||||
SBBQ 0(R9)(SI*8), R11
|
SBBQ 0(R9)(SI*8), R11
|
||||||
SBBQ 8(R9)(SI*8), R12
|
SBBQ 8(R9)(SI*8), R12
|
||||||
SBBQ 16(R9)(SI*8), R13
|
SBBQ 16(R9)(SI*8), R13
|
||||||
SBBQ 24(R9)(SI*8), R14
|
SBBQ 24(R9)(SI*8), R14
|
||||||
RCLQ $1, CX // save CF
|
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
MOVQ R12, 8(R10)(SI*8)
|
MOVQ R12, 8(R10)(SI*8)
|
||||||
MOVQ R13, 16(R10)(SI*8)
|
MOVQ R13, 16(R10)(SI*8)
|
||||||
MOVQ R14, 24(R10)(SI*8)
|
MOVQ R14, 24(R10)(SI*8)
|
||||||
|
RCLQ $1, CX // c = CF
|
||||||
|
|
||||||
ADDQ $4, SI // i += 4
|
ADDQ $4, SI // i += 4
|
||||||
SUBQ $4, DI // n -= 4
|
SUBQ $4, DI // n -= 4
|
||||||
CMPQ DI, $4
|
JGE U2 // if n >= 0 goto U2
|
||||||
JGE U2 // if n >= 4 goto U2
|
|
||||||
|
|
||||||
V2: CMPQ DI, $0
|
V2: ADDQ $4, DI // n += 4
|
||||||
JLE E2 // if n <= 0 goto E2
|
JLE E2 // if n <= 0 goto E2
|
||||||
|
|
||||||
L2: // n > 0
|
L2: // n > 0
|
||||||
|
RCRQ $1, CX // CF = c
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
RCRQ $1, CX // restore CF
|
|
||||||
SBBQ 0(R9)(SI*8), R11
|
SBBQ 0(R9)(SI*8), R11
|
||||||
RCLQ $1, CX // save CF
|
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
|
RCLQ $1, CX // c = CF
|
||||||
|
|
||||||
ADDQ $1, SI // i++
|
ADDQ $1, SI // i++
|
||||||
SUBQ $1, DI // n--
|
SUBQ $1, DI // n--
|
||||||
@ -149,10 +147,10 @@ TEXT ·addVW(SB),7,$0
|
|||||||
// uncomment the next line to disable the unrolled loop
|
// uncomment the next line to disable the unrolled loop
|
||||||
// JMP V3
|
// JMP V3
|
||||||
|
|
||||||
CMPQ DI, $4
|
SUBQ $4, DI // n -= 4
|
||||||
JL V3 // if n < 4 goto V3
|
JL V3 // if n < 4 goto V3
|
||||||
|
|
||||||
U3: // n >= 4
|
U3: // n >= 0
|
||||||
// regular loop body unrolled 4x
|
// regular loop body unrolled 4x
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
MOVQ 8(R8)(SI*8), R12
|
MOVQ 8(R8)(SI*8), R12
|
||||||
@ -162,25 +160,24 @@ U3: // n >= 4
|
|||||||
ADCQ $0, R12
|
ADCQ $0, R12
|
||||||
ADCQ $0, R13
|
ADCQ $0, R13
|
||||||
ADCQ $0, R14
|
ADCQ $0, R14
|
||||||
RCLQ $1, CX
|
|
||||||
ANDQ $1, CX
|
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
MOVQ R12, 8(R10)(SI*8)
|
MOVQ R12, 8(R10)(SI*8)
|
||||||
MOVQ R13, 16(R10)(SI*8)
|
MOVQ R13, 16(R10)(SI*8)
|
||||||
MOVQ R14, 24(R10)(SI*8)
|
MOVQ R14, 24(R10)(SI*8)
|
||||||
|
RCLQ $1, CX // c = CF
|
||||||
|
ANDQ $1, CX
|
||||||
|
|
||||||
ADDQ $4, SI // i += 4
|
ADDQ $4, SI // i += 4
|
||||||
SUBQ $4, DI // n -= 4
|
SUBQ $4, DI // n -= 4
|
||||||
CMPQ DI, $4
|
JGE U3 // if n >= 0 goto U3
|
||||||
JGE U3 // if n >= 4 goto U3
|
|
||||||
|
|
||||||
V3: CMPQ DI, $0
|
V3: ADDQ $4, DI // n += 4
|
||||||
JLE E3 // if n <= 0 goto E3
|
JLE E3 // if n <= 0 goto E3
|
||||||
|
|
||||||
L3: // n > 0
|
L3: // n > 0
|
||||||
ADDQ 0(R8)(SI*8), CX
|
ADDQ 0(R8)(SI*8), CX
|
||||||
MOVQ CX, 0(R10)(SI*8)
|
MOVQ CX, 0(R10)(SI*8)
|
||||||
RCLQ $1, CX
|
RCLQ $1, CX // c = CF
|
||||||
ANDQ $1, CX
|
ANDQ $1, CX
|
||||||
|
|
||||||
ADDQ $1, SI // i++
|
ADDQ $1, SI // i++
|
||||||
@ -204,10 +201,10 @@ TEXT ·subVW(SB),7,$0
|
|||||||
// uncomment the next line to disable the unrolled loop
|
// uncomment the next line to disable the unrolled loop
|
||||||
// JMP V4
|
// JMP V4
|
||||||
|
|
||||||
CMPQ DI, $4
|
SUBQ $4, DI // n -= 4
|
||||||
JL V4 // if n < 4 goto V4
|
JL V4 // if n < 4 goto V4
|
||||||
|
|
||||||
U4: // n >= 4
|
U4: // n >= 0
|
||||||
// regular loop body unrolled 4x
|
// regular loop body unrolled 4x
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
MOVQ 8(R8)(SI*8), R12
|
MOVQ 8(R8)(SI*8), R12
|
||||||
@ -217,26 +214,25 @@ U4: // n >= 4
|
|||||||
SBBQ $0, R12
|
SBBQ $0, R12
|
||||||
SBBQ $0, R13
|
SBBQ $0, R13
|
||||||
SBBQ $0, R14
|
SBBQ $0, R14
|
||||||
RCLQ $1, CX
|
|
||||||
ANDQ $1, CX
|
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
MOVQ R12, 8(R10)(SI*8)
|
MOVQ R12, 8(R10)(SI*8)
|
||||||
MOVQ R13, 16(R10)(SI*8)
|
MOVQ R13, 16(R10)(SI*8)
|
||||||
MOVQ R14, 24(R10)(SI*8)
|
MOVQ R14, 24(R10)(SI*8)
|
||||||
|
RCLQ $1, CX // c = CF
|
||||||
|
ANDQ $1, CX
|
||||||
|
|
||||||
ADDQ $4, SI // i += 4
|
ADDQ $4, SI // i += 4
|
||||||
SUBQ $4, DI // n -= 4
|
SUBQ $4, DI // n -= 4
|
||||||
CMPQ DI, $4
|
JGE U4 // if n >= 0 goto U4
|
||||||
JGE U4 // if n >= 4 goto U4
|
|
||||||
|
|
||||||
V4: CMPQ DI, $0
|
V4: ADDQ $4, DI // n += 4
|
||||||
JLE E4 // if n <= 0 goto E4
|
JLE E4 // if n <= 0 goto E4
|
||||||
|
|
||||||
L4: // n > 0
|
L4: // n > 0
|
||||||
MOVQ 0(R8)(SI*8), R11
|
MOVQ 0(R8)(SI*8), R11
|
||||||
SUBQ CX, R11
|
SUBQ CX, R11
|
||||||
MOVQ R11, 0(R10)(SI*8)
|
MOVQ R11, 0(R10)(SI*8)
|
||||||
RCLQ $1, CX
|
RCLQ $1, CX // c = CF
|
||||||
ANDQ $1, CX
|
ANDQ $1, CX
|
||||||
|
|
||||||
ADDQ $1, SI // i++
|
ADDQ $1, SI // i++
|
||||||
|
Loading…
Reference in New Issue
Block a user