mirror of
https://github.com/golang/go
synced 2024-11-18 08:04:40 -07:00
math/big: uses SIMD for some math big functions on s390x
The following benchmarks are improved by the amounts shown (Others unaffected beyond the level of noise.) Also adds a test to confirm non-SIMD implementation still correct, even when run on SIMD-capable machine Benchmark old new BenchmarkAddVV/100-18 66148.08 MB/s 117546.19 MB/s 1.8x BenchmarkAddVV/1000-18 70168.27 MB/s 133478.96 MB/s 1.9x BenchmarkAddVV/10000-18 67489.80 MB/s 100010.79 MB/s 1.5x BenchmarkAddVV/100000-18 54329.99 MB/s 69232.45 MB/s 1.3x BenchmarkAddVW/100-18 9929.10 MB/s 14841.31 MB/s 1.5x BenchmarkAddVW/1000-18 10583.31 MB/s 18674.44 MB/s 1.76x BenchmarkAddVW/10000-18 10521.15 MB/s 17484.10 MB/s 1.66x BenchmarkAddVW/100000-18 10616.56 MB/s 18084.27 MB/s 1.7x Change-Id: Ic9234c41a43f6c5e9d0e9377de8b4deeefc428a7 Reviewed-on: https://go-review.googlesource.com/32211 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
829aa6732a
commit
1e6b12a201
23
src/math/big/arith_decl_s390x.go
Normal file
23
src/math/big/arith_decl_s390x.go
Normal file
@ -0,0 +1,23 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !math_big_pure_go
|
||||
|
||||
package big
|
||||
|
||||
func addVV_check(z, x, y []Word) (c Word)
|
||||
func addVV_vec(z, x, y []Word) (c Word)
|
||||
func addVV_novec(z, x, y []Word) (c Word)
|
||||
func subVV_check(z, x, y []Word) (c Word)
|
||||
func subVV_vec(z, x, y []Word) (c Word)
|
||||
func subVV_novec(z, x, y []Word) (c Word)
|
||||
func addVW_check(z, x []Word, y Word) (c Word)
|
||||
func addVW_vec(z, x []Word, y Word) (c Word)
|
||||
func addVW_novec(z, x []Word, y Word) (c Word)
|
||||
func subVW_check(z, x []Word, y Word) (c Word)
|
||||
func subVW_vec(z, x []Word, y Word) (c Word)
|
||||
func subVW_novec(z, x []Word, y Word) (c Word)
|
||||
func hasVectorFacility() bool
|
||||
|
||||
var hasVX = hasVectorFacility()
|
@ -9,6 +9,26 @@
|
||||
// This file provides fast assembly versions for the elementary
|
||||
// arithmetic operations on vectors implemented in arith.go.
|
||||
|
||||
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
|
||||
MOVD $x-24(SP), R1
|
||||
XC $24, 0(R1), 0(R1) // clear the storage
|
||||
MOVD $2, R0 // R0 is the number of double words stored -1
|
||||
WORD $0xB2B01000 // STFLE 0(R1)
|
||||
XOR R0, R0 // reset the value of R0
|
||||
MOVBZ z-8(SP), R1
|
||||
AND $0x40, R1
|
||||
BEQ novector
|
||||
vectorinstalled:
|
||||
// check if the vector instruction has been enabled
|
||||
VLEIB $0, $0xF, V16
|
||||
VLGVB $0, V16, R1
|
||||
CMPBNE R1, $0xF, novector
|
||||
MOVB $1, ret+0(FP) // have vx
|
||||
RET
|
||||
novector:
|
||||
MOVB $0, ret+0(FP) // no vx
|
||||
RET
|
||||
|
||||
TEXT ·mulWW(SB),NOSPLIT,$0
|
||||
MOVD x+0(FP), R3
|
||||
MOVD y+8(FP), R4
|
||||
@ -29,7 +49,31 @@ TEXT ·divWW(SB),NOSPLIT,$0
|
||||
|
||||
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
|
||||
// func addVV(z, x, y []Word) (c Word)
|
||||
|
||||
|
||||
TEXT ·addVV(SB),NOSPLIT,$0
|
||||
MOVD addvectorfacility+0x00(SB),R1
|
||||
BR (R1)
|
||||
|
||||
TEXT ·addVV_check(SB),NOSPLIT, $0
|
||||
MOVB ·hasVX(SB), R1
|
||||
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
|
||||
MOVD $addvectorfacility+0x00(SB), R1
|
||||
MOVD $·addVV_novec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·addVV_novec(SB), 0(R1)
|
||||
BR ·addVV_novec(SB)
|
||||
vectorimpl:
|
||||
MOVD $addvectorfacility+0x00(SB), R1
|
||||
MOVD $·addVV_vec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·addVV_vec(SB), 0(R1)
|
||||
BR ·addVV_vec(SB)
|
||||
|
||||
GLOBL addvectorfacility+0x00(SB), NOPTR, $8
|
||||
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
|
||||
|
||||
TEXT ·addVV_vec(SB),NOSPLIT,$0
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R9
|
||||
@ -39,8 +83,96 @@ TEXT ·addVV(SB),NOSPLIT,$0
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R10 // i = 0
|
||||
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
SUB $4, R3
|
||||
BLT v1
|
||||
SUB $12, R3 // n -= 16
|
||||
BLT A1 // if n < 0 goto A1
|
||||
|
||||
MOVD R8, R5
|
||||
MOVD R9, R6
|
||||
MOVD R2, R7
|
||||
// n >= 0
|
||||
// regular loop body unrolled 16x
|
||||
VZERO V0 // c = 0
|
||||
UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V8
|
||||
ADD $64, R5
|
||||
VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
|
||||
|
||||
|
||||
VLM 0(R6), V9, V12 // 64-bytes into V9..V16
|
||||
ADD $64, R6
|
||||
VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V1, V9, V0, V25
|
||||
VACQ V1, V9, V0, V17
|
||||
VACCCQ V2, V10, V25, V26
|
||||
VACQ V2, V10, V25, V18
|
||||
|
||||
|
||||
VLM 0(R5), V5, V6 // 32-bytes into V1..V8
|
||||
VLM 0(R6), V13, V14 // 32-bytes into V9..V16
|
||||
ADD $32, R5
|
||||
ADD $32, R6
|
||||
|
||||
VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V3, V11, V26, V27
|
||||
VACQ V3, V11, V26, V19
|
||||
VACCCQ V4, V12, V27, V28
|
||||
VACQ V4, V12, V27, V20
|
||||
|
||||
VLM 0(R5), V7, V8 // 32-bytes into V1..V8
|
||||
VLM 0(R6), V15, V16 // 32-bytes into V9..V16
|
||||
ADD $32, R5
|
||||
ADD $32, R6
|
||||
|
||||
VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V5, V13, V28, V29
|
||||
VACQ V5, V13, V28, V21
|
||||
VACCCQ V6, V14, V29, V30
|
||||
VACQ V6, V14, V29, V22
|
||||
|
||||
VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V7, V15, V30, V31
|
||||
VACQ V7, V15, V30, V23
|
||||
VACCCQ V8, V16, V31, V0 //V0 has carry-over
|
||||
VACQ V8, V16, V31, V24
|
||||
|
||||
VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
|
||||
VSTM V17, V24, 0(R7) // 128-bytes into z
|
||||
ADD $128, R7
|
||||
ADD $128, R10 // i += 16
|
||||
SUB $16, R3 // n -= 16
|
||||
BGE UU1 // if n >= 0 goto U1
|
||||
VLGVG $1, V0, R4 // put cf into R4
|
||||
NEG R4, R4 // save cf
|
||||
|
||||
A1: ADD $12, R3 // n += 16
|
||||
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
SUB $4, R3 // n -= 4
|
||||
BLT v1 // if n < 0 goto v1
|
||||
|
||||
U1: // n >= 0
|
||||
@ -92,10 +224,249 @@ E1: NEG R4, R4
|
||||
MOVD R4, c+72(FP) // return c
|
||||
RET
|
||||
|
||||
TEXT ·addVV_novec(SB),NOSPLIT,$0
|
||||
novec:
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R9
|
||||
MOVD z+0(FP), R2
|
||||
|
||||
MOVD $0, R4 // c = 0
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R10 // i = 0
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
SUB $4, R3 // n -= 4
|
||||
BLT v1n // if n < 0 goto v1n
|
||||
U1n: // n >= 0
|
||||
// regular loop body unrolled 4x
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
MOVD 8(R8)(R10*1), R6
|
||||
MOVD 16(R8)(R10*1), R7
|
||||
MOVD 24(R8)(R10*1), R1
|
||||
ADDC R4, R4 // restore CF
|
||||
MOVD 0(R9)(R10*1), R11
|
||||
ADDE R11, R5
|
||||
MOVD 8(R9)(R10*1), R11
|
||||
ADDE R11, R6
|
||||
MOVD 16(R9)(R10*1), R11
|
||||
ADDE R11, R7
|
||||
MOVD 24(R9)(R10*1), R11
|
||||
ADDE R11, R1
|
||||
MOVD R0, R4
|
||||
ADDE R4, R4 // save CF
|
||||
NEG R4, R4
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
MOVD R6, 8(R2)(R10*1)
|
||||
MOVD R7, 16(R2)(R10*1)
|
||||
MOVD R1, 24(R2)(R10*1)
|
||||
|
||||
|
||||
ADD $32, R10 // i += 4
|
||||
SUB $4, R3 // n -= 4
|
||||
BGE U1n // if n >= 0 goto U1n
|
||||
|
||||
v1n: ADD $4, R3 // n += 4
|
||||
BLE E1n // if n <= 0 goto E1n
|
||||
|
||||
L1n: // n > 0
|
||||
ADDC R4, R4 // restore CF
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
MOVD 0(R9)(R10*1), R11
|
||||
ADDE R11, R5
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
MOVD R0, R4
|
||||
ADDE R4, R4 // save CF
|
||||
NEG R4, R4
|
||||
|
||||
ADD $8, R10 // i++
|
||||
SUB $1, R3 // n--
|
||||
BGT L1n // if n > 0 goto L1n
|
||||
|
||||
E1n: NEG R4, R4
|
||||
MOVD R4, c+72(FP) // return c
|
||||
RET
|
||||
|
||||
|
||||
TEXT ·subVV(SB),NOSPLIT,$0
|
||||
MOVD subvectorfacility+0x00(SB),R1
|
||||
BR (R1)
|
||||
|
||||
TEXT ·subVV_check(SB),NOSPLIT,$0
|
||||
MOVB ·hasVX(SB), R1
|
||||
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
|
||||
MOVD $subvectorfacility+0x00(SB), R1
|
||||
MOVD $·subVV_novec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·subVV_novec(SB), 0(R1)
|
||||
BR ·subVV_novec(SB)
|
||||
vectorimpl:
|
||||
MOVD $subvectorfacility+0x00(SB), R1
|
||||
MOVD $·subVV_vec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·subVV_vec(SB), 0(R1)
|
||||
BR ·subVV_vec(SB)
|
||||
|
||||
GLOBL subvectorfacility+0x00(SB), NOPTR, $8
|
||||
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
|
||||
|
||||
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
|
||||
// func subVV(z, x, y []Word) (c Word)
|
||||
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
|
||||
TEXT ·subVV(SB),NOSPLIT,$0
|
||||
TEXT ·subVV_vec(SB),NOSPLIT,$0
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R9
|
||||
MOVD z+0(FP), R2
|
||||
MOVD $0, R4 // c = 0
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R10 // i = 0
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
SUB $4, R3 // n -= 4
|
||||
BLT v1 // if n < 0 goto v1
|
||||
SUB $12, R3 // n -= 16
|
||||
BLT A1 // if n < 0 goto A1
|
||||
|
||||
MOVD R8, R5
|
||||
MOVD R9, R6
|
||||
MOVD R2, R7
|
||||
|
||||
// n >= 0
|
||||
// regular loop body unrolled 16x
|
||||
VZERO V0 // cf = 0
|
||||
MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
|
||||
VLVGG $1, R4, V0 //put carry into V0
|
||||
|
||||
UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V8
|
||||
ADD $64, R5
|
||||
VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
|
||||
|
||||
|
||||
VLM 0(R6), V9, V12 // 64-bytes into V9..V16
|
||||
ADD $64, R6
|
||||
VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order
|
||||
|
||||
VSBCBIQ V1, V9, V0, V25
|
||||
VSBIQ V1, V9, V0, V17
|
||||
VSBCBIQ V2, V10, V25, V26
|
||||
VSBIQ V2, V10, V25, V18
|
||||
|
||||
|
||||
VLM 0(R5), V5, V6 // 32-bytes into V1..V8
|
||||
VLM 0(R6), V13, V14 // 32-bytes into V9..V16
|
||||
ADD $32, R5
|
||||
ADD $32, R6
|
||||
|
||||
VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order
|
||||
|
||||
VSBCBIQ V3, V11, V26, V27
|
||||
VSBIQ V3, V11, V26, V19
|
||||
VSBCBIQ V4, V12, V27, V28
|
||||
VSBIQ V4, V12, V27, V20
|
||||
|
||||
VLM 0(R5), V7, V8 // 32-bytes into V1..V8
|
||||
VLM 0(R6), V15, V16 // 32-bytes into V9..V16
|
||||
ADD $32, R5
|
||||
ADD $32, R6
|
||||
|
||||
VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order
|
||||
|
||||
VSBCBIQ V5, V13, V28, V29
|
||||
VSBIQ V5, V13, V28, V21
|
||||
VSBCBIQ V6, V14, V29, V30
|
||||
VSBIQ V6, V14, V29, V22
|
||||
|
||||
VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order
|
||||
|
||||
VSBCBIQ V7, V15, V30, V31
|
||||
VSBIQ V7, V15, V30, V23
|
||||
VSBCBIQ V8, V16, V31, V0 //V0 has carry-over
|
||||
VSBIQ V8, V16, V31, V24
|
||||
|
||||
VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
|
||||
VSTM V17, V24, 0(R7) // 128-bytes into z
|
||||
ADD $128, R7
|
||||
ADD $128, R10 // i += 16
|
||||
SUB $16, R3 // n -= 16
|
||||
BGE UU1 // if n >= 0 goto U1
|
||||
VLGVG $1, V0, R4 // put cf into R4
|
||||
SUB $1, R4 // save cf
|
||||
|
||||
A1: ADD $12, R3 // n += 16
|
||||
BLT v1 // if n < 0 goto v1
|
||||
|
||||
U1: // n >= 0
|
||||
// regular loop body unrolled 4x
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
MOVD 8(R8)(R10*1), R6
|
||||
MOVD 16(R8)(R10*1), R7
|
||||
MOVD 24(R8)(R10*1), R1
|
||||
MOVD R0, R11
|
||||
SUBC R4, R11 // restore CF
|
||||
MOVD 0(R9)(R10*1), R11
|
||||
SUBE R11, R5
|
||||
MOVD 8(R9)(R10*1), R11
|
||||
SUBE R11, R6
|
||||
MOVD 16(R9)(R10*1), R11
|
||||
SUBE R11, R7
|
||||
MOVD 24(R9)(R10*1), R11
|
||||
SUBE R11, R1
|
||||
MOVD R0, R4
|
||||
SUBE R4, R4 // save CF
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
MOVD R6, 8(R2)(R10*1)
|
||||
MOVD R7, 16(R2)(R10*1)
|
||||
MOVD R1, 24(R2)(R10*1)
|
||||
|
||||
ADD $32, R10 // i += 4
|
||||
SUB $4, R3 // n -= 4
|
||||
BGE U1 // if n >= 0 goto U1n
|
||||
|
||||
v1: ADD $4, R3 // n += 4
|
||||
BLE E1 // if n <= 0 goto E1
|
||||
|
||||
L1: // n > 0
|
||||
MOVD R0, R11
|
||||
SUBC R4, R11 // restore CF
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
MOVD 0(R9)(R10*1), R11
|
||||
SUBE R11, R5
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
MOVD R0, R4
|
||||
SUBE R4, R4 // save CF
|
||||
|
||||
ADD $8, R10 // i++
|
||||
SUB $1, R3 // n--
|
||||
BGT L1 // if n > 0 goto L1n
|
||||
|
||||
E1: NEG R4, R4
|
||||
MOVD R4, c+72(FP) // return c
|
||||
RET
|
||||
|
||||
|
||||
// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
|
||||
// func subVV(z, x, y []Word) (c Word)
|
||||
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
|
||||
TEXT ·subVV_novec(SB),NOSPLIT,$0
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R9
|
||||
@ -158,9 +529,163 @@ E1: NEG R4, R4
|
||||
MOVD R4, c+72(FP) // return c
|
||||
RET
|
||||
|
||||
|
||||
// func addVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addVW(SB),NOSPLIT,$0
|
||||
MOVD addwvectorfacility+0x00(SB),R1
|
||||
BR (R1)
|
||||
|
||||
TEXT ·addVW_check(SB),NOSPLIT,$0
|
||||
MOVB ·hasVX(SB), R1
|
||||
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
|
||||
MOVD $addwvectorfacility+0x00(SB), R1
|
||||
MOVD $·addVW_novec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·addVW_novec(SB), 0(R1)
|
||||
BR ·addVW_novec(SB)
|
||||
vectorimpl:
|
||||
MOVD $addwvectorfacility+0x00(SB), R1
|
||||
MOVD $·addVW_vec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·addVW_vec(SB), 0(R1)
|
||||
BR ·addVW_vec(SB)
|
||||
|
||||
GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
|
||||
DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)
|
||||
|
||||
|
||||
// func addVW_vec(z, x []Word, y Word) (c Word)
|
||||
TEXT ·addVW_vec(SB),NOSPLIT,$0
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R4 // c = y
|
||||
MOVD z+0(FP), R2
|
||||
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R10 // i = 0
|
||||
MOVD R8, R5
|
||||
MOVD R2, R7
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
SUB $4, R3 // n -= 4
|
||||
BLT v10 // if n < 0 goto v10
|
||||
SUB $12, R3
|
||||
BLT A10
|
||||
|
||||
// n >= 0
|
||||
// regular loop body unrolled 16x
|
||||
|
||||
VZERO V0 // prepare V0 to be final carry register
|
||||
VZERO V9 // to ensure upper half is zero
|
||||
VLVGG $1, R4, V9
|
||||
UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4
|
||||
ADD $64, R5
|
||||
VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
|
||||
|
||||
|
||||
VACCCQ V1, V9, V0, V25
|
||||
VACQ V1, V9, V0, V17
|
||||
VZERO V9
|
||||
VACCCQ V2, V9, V25, V26
|
||||
VACQ V2, V9, V25, V18
|
||||
|
||||
|
||||
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
|
||||
ADD $32, R5
|
||||
|
||||
VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V3, V9, V26, V27
|
||||
VACQ V3, V9, V26, V19
|
||||
VACCCQ V4, V9, V27, V28
|
||||
VACQ V4, V9, V27, V20
|
||||
|
||||
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
|
||||
ADD $32, R5
|
||||
|
||||
VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V5, V9, V28, V29
|
||||
VACQ V5, V9, V28, V21
|
||||
VACCCQ V6, V9, V29, V30
|
||||
VACQ V6, V9, V29, V22
|
||||
|
||||
VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
|
||||
|
||||
VACCCQ V7, V9, V30, V31
|
||||
VACQ V7, V9, V30, V23
|
||||
VACCCQ V8, V9, V31, V0 //V0 has carry-over
|
||||
VACQ V8, V9, V31, V24
|
||||
|
||||
VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
|
||||
VSTM V17, V24, 0(R7) // 128-bytes into z
|
||||
ADD $128, R7
|
||||
ADD $128, R10 // i += 16
|
||||
SUB $16, R3 // n -= 16
|
||||
BGE UU1 // if n >= 0 goto U1
|
||||
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
|
||||
|
||||
A10: ADD $12, R3 // n += 16
|
||||
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
|
||||
BLT v10 // if n < 0 goto v10
|
||||
|
||||
|
||||
U4: // n >= 0
|
||||
// regular loop body unrolled 4x
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
MOVD 8(R8)(R10*1), R6
|
||||
MOVD 16(R8)(R10*1), R7
|
||||
MOVD 24(R8)(R10*1), R1
|
||||
ADDC R4, R5
|
||||
ADDE R0, R6
|
||||
ADDE R0, R7
|
||||
ADDE R0, R1
|
||||
ADDE R0, R0
|
||||
MOVD R0, R4 // save CF
|
||||
SUB R0, R0
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
MOVD R6, 8(R2)(R10*1)
|
||||
MOVD R7, 16(R2)(R10*1)
|
||||
MOVD R1, 24(R2)(R10*1)
|
||||
|
||||
ADD $32, R10 // i += 4 -> i +=32
|
||||
SUB $4, R3 // n -= 4
|
||||
BGE U4 // if n >= 0 goto U4
|
||||
|
||||
v10: ADD $4, R3 // n += 4
|
||||
BLE E10 // if n <= 0 goto E4
|
||||
|
||||
|
||||
L4: // n > 0
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
ADDC R4, R5
|
||||
ADDE R0, R0
|
||||
MOVD R0, R4 // save CF
|
||||
SUB R0, R0
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
|
||||
ADD $8, R10 // i++
|
||||
SUB $1, R3 // n--
|
||||
BGT L4 // if n > 0 goto L4
|
||||
|
||||
E10: MOVD R4, c+56(FP) // return c
|
||||
|
||||
RET
|
||||
|
||||
|
||||
TEXT ·addVW_novec(SB),NOSPLIT,$0
|
||||
//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
@ -214,10 +739,166 @@ E4: MOVD R4, c+56(FP) // return c
|
||||
|
||||
RET
|
||||
|
||||
TEXT ·subVW(SB),NOSPLIT,$0
|
||||
MOVD subwvectorfacility+0x00(SB),R1
|
||||
BR (R1)
|
||||
|
||||
TEXT ·subVW_check(SB),NOSPLIT,$0
|
||||
MOVB ·hasVX(SB), R1
|
||||
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
|
||||
MOVD $subwvectorfacility+0x00(SB), R1
|
||||
MOVD $·subVW_novec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·subVW_novec(SB), 0(R1)
|
||||
BR ·subVW_novec(SB)
|
||||
vectorimpl:
|
||||
MOVD $subwvectorfacility+0x00(SB), R1
|
||||
MOVD $·subVW_vec(SB), R2
|
||||
MOVD R2, 0(R1)
|
||||
//MOVD $·subVW_vec(SB), 0(R1)
|
||||
BR ·subVW_vec(SB)
|
||||
|
||||
GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
|
||||
DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
|
||||
|
||||
// func subVW(z, x []Word, y Word) (c Word)
|
||||
TEXT ·subVW_vec(SB),NOSPLIT,$0
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R4 // c = y
|
||||
MOVD z+0(FP), R2
|
||||
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R10 // i = 0
|
||||
MOVD R8, R5
|
||||
MOVD R2, R7
|
||||
|
||||
// s/JL/JMP/ below to disable the unrolled loop
|
||||
SUB $4, R3 // n -= 4
|
||||
BLT v11 // if n < 0 goto v11
|
||||
SUB $12, R3
|
||||
BLT A11
|
||||
|
||||
VZERO V0
|
||||
MOVD $1, R6 // prepare V0 to be final carry register
|
||||
VLVGG $1, R6, V0 // borrow is initially "no borrow"
|
||||
VZERO V9 // to ensure upper half is zero
|
||||
VLVGG $1, R4, V9
|
||||
|
||||
// n >= 0
|
||||
// regular loop body unrolled 16x
|
||||
|
||||
|
||||
UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4
|
||||
ADD $64, R5
|
||||
VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
|
||||
|
||||
|
||||
VSBCBIQ V1, V9, V0, V25
|
||||
VSBIQ V1, V9, V0, V17
|
||||
VZERO V9
|
||||
VSBCBIQ V2, V9, V25, V26
|
||||
VSBIQ V2, V9, V25, V18
|
||||
|
||||
VLM 0(R5), V5, V6 // 32-bytes into V5..V6
|
||||
ADD $32, R5
|
||||
|
||||
VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
|
||||
|
||||
|
||||
VSBCBIQ V3, V9, V26, V27
|
||||
VSBIQ V3, V9, V26, V19
|
||||
VSBCBIQ V4, V9, V27, V28
|
||||
VSBIQ V4, V9, V27, V20
|
||||
|
||||
VLM 0(R5), V7, V8 // 32-bytes into V7..V8
|
||||
ADD $32, R5
|
||||
|
||||
VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
|
||||
|
||||
VSBCBIQ V5, V9, V28, V29
|
||||
VSBIQ V5, V9, V28, V21
|
||||
VSBCBIQ V6, V9, V29, V30
|
||||
VSBIQ V6, V9, V29, V22
|
||||
|
||||
VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
|
||||
|
||||
VSBCBIQ V7, V9, V30, V31
|
||||
VSBIQ V7, V9, V30, V23
|
||||
VSBCBIQ V8, V9, V31, V0 // V0 has carry-over
|
||||
VSBIQ V8, V9, V31, V24
|
||||
|
||||
VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
|
||||
VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
|
||||
VSTM V17, V24, 0(R7) // 128-bytes into z
|
||||
ADD $128, R7
|
||||
ADD $128, R10 // i += 16
|
||||
SUB $16, R3 // n -= 16
|
||||
BGE UU1 // if n >= 0 goto U1
|
||||
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
|
||||
SUB $1, R4 // save cf
|
||||
NEG R4, R4
|
||||
A11: ADD $12, R3 // n += 16
|
||||
|
||||
BLT v11 // if n < 0 goto v11
|
||||
|
||||
// n >= 0
|
||||
// regular loop body unrolled 4x
|
||||
|
||||
U4: // n >= 0
|
||||
// regular loop body unrolled 4x
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
MOVD 8(R8)(R10*1), R6
|
||||
MOVD 16(R8)(R10*1), R7
|
||||
MOVD 24(R8)(R10*1), R1
|
||||
SUBC R4, R5 //SLGR -> SUBC
|
||||
SUBE R0, R6 //SLBGR -> SUBE
|
||||
SUBE R0, R7
|
||||
SUBE R0, R1
|
||||
SUBE R4, R4 // save CF
|
||||
NEG R4, R4
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
MOVD R6, 8(R2)(R10*1)
|
||||
MOVD R7, 16(R2)(R10*1)
|
||||
MOVD R1, 24(R2)(R10*1)
|
||||
|
||||
ADD $32, R10 // i += 4 -> i +=32
|
||||
SUB $4, R3 // n -= 4
|
||||
BGE U4 // if n >= 0 goto U4
|
||||
|
||||
v11: ADD $4, R3 // n += 4
|
||||
BLE E11 // if n <= 0 goto E4
|
||||
|
||||
L4: // n > 0
|
||||
|
||||
MOVD 0(R8)(R10*1), R5
|
||||
SUBC R4, R5
|
||||
SUBE R4, R4 // save CF
|
||||
NEG R4, R4
|
||||
MOVD R5, 0(R2)(R10*1)
|
||||
|
||||
ADD $8, R10 // i++
|
||||
SUB $1, R3 // n--
|
||||
BGT L4 // if n > 0 goto L4
|
||||
|
||||
E11: MOVD R4, c+56(FP) // return c
|
||||
|
||||
RET
|
||||
|
||||
//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
|
||||
// func subVW(z, x []Word, y Word) (c Word)
|
||||
// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
|
||||
TEXT ·subVW(SB),NOSPLIT,$0
|
||||
TEXT ·subVW_novec(SB),NOSPLIT,$0
|
||||
MOVD z_len+8(FP), R3
|
||||
MOVD x+24(FP), R8
|
||||
MOVD y+48(FP), R4 // c = y
|
||||
@ -565,3 +1246,4 @@ TEXT ·bitLen(SB),NOSPLIT,$0
|
||||
SUB R2, R3
|
||||
MOVD R3, n+8(FP)
|
||||
RET
|
||||
|
||||
|
44
src/math/big/arith_s390x_test.go
Normal file
44
src/math/big/arith_s390x_test.go
Normal file
@ -0,0 +1,44 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build s390x !math_big_pure_go
|
||||
|
||||
package big
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Tests whether the non vector routines are working, even when the tests are run on a
|
||||
// vector-capable machine
|
||||
|
||||
func TestFunVVnovec(t *testing.T) {
|
||||
if hasVX == true {
|
||||
for _, a := range sumVV {
|
||||
arg := a
|
||||
testFunVV(t, "addVV_novec", addVV_novec, arg)
|
||||
|
||||
arg = argVV{a.z, a.y, a.x, a.c}
|
||||
testFunVV(t, "addVV_novec symmetric", addVV_novec, arg)
|
||||
|
||||
arg = argVV{a.x, a.z, a.y, a.c}
|
||||
testFunVV(t, "subVV_novec", subVV_novec, arg)
|
||||
|
||||
arg = argVV{a.y, a.z, a.x, a.c}
|
||||
testFunVV(t, "subVV_novec symmetric", subVV_novec, arg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFunVWnovec(t *testing.T) {
|
||||
if hasVX == true {
|
||||
for _, a := range sumVW {
|
||||
arg := a
|
||||
testFunVW(t, "addVW_novec", addVW_novec, arg)
|
||||
|
||||
arg = argVW{a.x, a.z, a.y, a.c}
|
||||
testFunVW(t, "subVW_novec", subVW_novec, arg)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user