1
0
mirror of https://github.com/golang/go synced 2024-11-14 08:50:22 -07:00
go/src/math/cosh_s390x.s

228 lines
6.1 KiB
ArmAsm
Raw Normal View History

math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Constants
DATA coshrodataL23<>+0(SB)/8, $0.231904681384629956E-16
DATA coshrodataL23<>+8(SB)/8, $0.693147180559945286E+00
DATA coshrodataL23<>+16(SB)/8, $0.144269504088896339E+01
DATA coshrodataL23<>+24(SB)/8, $704.E0
GLOBL coshrodataL23<>+0(SB), RODATA, $32
DATA coshxinf<>+0(SB)/8, $0x7FF0000000000000
GLOBL coshxinf<>+0(SB), RODATA, $8
DATA coshxlim1<>+0(SB)/8, $800.E0
GLOBL coshxlim1<>+0(SB), RODATA, $8
DATA coshxaddhy<>+0(SB)/8, $0xc2f0000100003fdf
GLOBL coshxaddhy<>+0(SB), RODATA, $8
DATA coshx4ff<>+0(SB)/8, $0x4ff0000000000000
GLOBL coshx4ff<>+0(SB), RODATA, $8
DATA coshe1<>+0(SB)/8, $0x3ff000000000000a
GLOBL coshe1<>+0(SB), RODATA, $8
// Log multiplier table
DATA coshtab<>+0(SB)/8, $0.442737824274138381E-01
DATA coshtab<>+8(SB)/8, $0.263602189790660309E-01
DATA coshtab<>+16(SB)/8, $0.122565642281703586E-01
DATA coshtab<>+24(SB)/8, $0.143757052860721398E-02
DATA coshtab<>+32(SB)/8, $-.651375034121276075E-02
DATA coshtab<>+40(SB)/8, $-.119317678849450159E-01
DATA coshtab<>+48(SB)/8, $-.150868749549871069E-01
DATA coshtab<>+56(SB)/8, $-.161992609578469234E-01
DATA coshtab<>+64(SB)/8, $-.154492360403337917E-01
DATA coshtab<>+72(SB)/8, $-.129850717389178721E-01
DATA coshtab<>+80(SB)/8, $-.892902649276657891E-02
DATA coshtab<>+88(SB)/8, $-.338202636596794887E-02
DATA coshtab<>+96(SB)/8, $0.357266307045684762E-02
DATA coshtab<>+104(SB)/8, $0.118665304327406698E-01
DATA coshtab<>+112(SB)/8, $0.214434994118118914E-01
DATA coshtab<>+120(SB)/8, $0.322580645161290314E-01
GLOBL coshtab<>+0(SB), RODATA, $128
// Minimax polynomial approximations
DATA coshe2<>+0(SB)/8, $0.500000000000004237e+00
GLOBL coshe2<>+0(SB), RODATA, $8
DATA coshe3<>+0(SB)/8, $0.166666666630345592e+00
GLOBL coshe3<>+0(SB), RODATA, $8
DATA coshe4<>+0(SB)/8, $0.416666664838056960e-01
GLOBL coshe4<>+0(SB), RODATA, $8
DATA coshe5<>+0(SB)/8, $0.833349307718286047e-02
GLOBL coshe5<>+0(SB), RODATA, $8
DATA coshe6<>+0(SB)/8, $0.138926439368309441e-02
GLOBL coshe6<>+0(SB), RODATA, $8
// Cosh returns the hyperbolic cosine of x.
//
// Special cases are:
// Cosh(±0) = 1
// Cosh(±Inf) = +Inf
// Cosh(NaN) = NaN
// The algorithm used is minimax polynomial approximation
// with coefficients determined with a Remez exchange algorithm.
TEXT ·coshAsm(SB),NOSPLIT,$0-16
FMOVD x+0(FP), F0
MOVD $coshrodataL23<>+0(SB), R9
WORD $0xB3120000 //ltdbr %f0,%f0
MOVD $0x4086000000000000, R2
MOVD $0x4086000000000000, R3
BLTU L19
FMOVD F0, F4
L2:
WORD $0xED409018 //cdb %f4,.L24-.L23(%r9)
BYTE $0x00
BYTE $0x19
BGE L14 //jnl .L14
BVS L14
WFCEDBS V4, V4, V2
BEQ L20
L1:
FMOVD F0, ret+8(FP)
RET
L14:
WFCEDBS V4, V4, V2
BVS L1
MOVD $coshxlim1<>+0(SB), R1
FMOVD 0(R1), F2
WFCHEDBS V4, V2, V2
BEQ L21
MOVD $coshxaddhy<>+0(SB), R1
FMOVD coshrodataL23<>+16(SB), F5
FMOVD 0(R1), F2
WFMSDB V0, V5, V2, V5
FMOVD coshrodataL23<>+8(SB), F3
FADD F5, F2
MOVD $coshe6<>+0(SB), R1
WFMSDB V2, V3, V0, V3
FMOVD 0(R1), F6
WFMDB V3, V3, V1
MOVD $coshe4<>+0(SB), R1
FMOVD coshrodataL23<>+0(SB), F7
WFMADB V2, V7, V3, V2
FMOVD 0(R1), F3
MOVD $coshe5<>+0(SB), R1
WFMADB V1, V6, V3, V6
FMOVD 0(R1), F7
MOVD $coshe3<>+0(SB), R1
FMOVD 0(R1), F3
WFMADB V1, V7, V3, V7
FNEG F2, F3
LGDR F5, R1
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
MOVD $coshe2<>+0(SB), R3
WFCEDBS V4, V0, V0
FMOVD 0(R3), F5
MOVD $coshe1<>+0(SB), R3
WFMADB V1, V6, V5, V6
FMOVD 0(R3), F5
WORD $0xEC21000F //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WFMADB V1, V7, V5, V1
BVS L22
WORD $0xEC4139BC //risbg %r4,%r1,57,128+60,3
BYTE $0x03
BYTE $0x55
MOVD $coshtab<>+0(SB), R3
WFMADB V3, V6, V1, V6
WORD $0x68043000 //ld %f0,0(%r4,%r3)
FMSUB F0, F3, F2
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
WORD $0xA71AF000 //ahi %r1,-4096
WFMADB V2, V6, V0, V6
L17:
WORD $0xEC21000F //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
LDGR R2, F2
FMADD F2, F6, F2
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
MOVD $coshx4ff<>+0(SB), R1
FMOVD 0(R1), F0
FMUL F2, F0
FMOVD F0, ret+8(FP)
RET
L19:
FNEG F0, F4
BR L2
L20:
MOVD $coshxaddhy<>+0(SB), R1
FMOVD coshrodataL23<>+16(SB), F3
FMOVD 0(R1), F2
WFMSDB V0, V3, V2, V3
FMOVD coshrodataL23<>+8(SB), F4
FADD F3, F2
MOVD $coshe6<>+0(SB), R1
FMSUB F4, F2, F0
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
FMOVD 0(R1), F6
WFMDB V0, V0, V1
MOVD $coshe4<>+0(SB), R1
FMOVD 0(R1), F4
MOVD $coshe5<>+0(SB), R1
FMOVD coshrodataL23<>+0(SB), F5
WFMADB V1, V6, V4, V6
FMADD F5, F2, F0
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
FMOVD 0(R1), F2
MOVD $coshe3<>+0(SB), R1
FMOVD 0(R1), F4
WFMADB V1, V2, V4, V2
MOVD $coshe2<>+0(SB), R1
FMOVD 0(R1), F5
FNEG F0, F4
WFMADB V1, V6, V5, V6
MOVD $coshe1<>+0(SB), R1
FMOVD 0(R1), F5
WFMADB V1, V2, V5, V1
LGDR F3, R1
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
MOVD $coshtab<>+0(SB), R5
WFMADB V4, V6, V1, V3
WORD $0xEC4139BC //risbg %r4,%r1,57,128+60,3
BYTE $0x03
BYTE $0x55
WFMSDB V4, V6, V1, V6
WORD $0x68145000 //ld %f1,0(%r4,%r5)
WFMSDB V4, V1, V0, V2
WORD $0xA7487FBE //lhi %r4,32702
FMADD F3, F2, F1
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
SUBW R1, R4
WORD $0xECC439BC //risbg %r12,%r4,57,128+60,3
BYTE $0x03
BYTE $0x55
WORD $0x682C5000 //ld %f2,0(%r12,%r5)
FMSUB F2, F4, F0
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
WORD $0xEC21000F //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WFMADB V0, V6, V2, V6
WORD $0xEC34000F //risbgn %r3,%r4,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
LDGR R2, F2
LDGR R3, F0
FMADD F2, F1, F2
FMADD F0, F6, F0
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
FADD F2, F0
FMOVD F0, ret+8(FP)
RET
L22:
WORD $0xA7387FBE //lhi %r3,32702
MOVD $coshtab<>+0(SB), R4
SUBW R1, R3
WFMSDB V3, V6, V1, V6
WORD $0xEC3339BC //risbg %r3,%r3,57,128+60,3
BYTE $0x03
BYTE $0x55
WORD $0x68034000 //ld %f0,0(%r3,%r4)
FMSUB F0, F3, F2
math: use SIMD to accelerate some scalar math functions on s390x Note, most math functions are structured to use stubs, so that they can be accelerated with assembly on any platform. Sinh, cosh, and tanh were not structued with stubs, so this CL does that. This set of routines was chosen as likely to produce good speedups with assembly on any platform. Technique used was minimax polynomial approximation using tables of polynomial coefficients, with argument range reduction. A table of scaling factors was also used for cosh and log10. before after speedup BenchmarkCos 22.1 ns/op 6.79 ns/op 3.25x BenchmarkCosh 125 ns/op 11.7 ns/op 10.68x BenchmarkLog10 48.4 ns/op 12.5 ns/op 3.87x BenchmarkSin 22.2 ns/op 6.55 ns/op 3.39x BenchmarkSinh 125 ns/op 14.2 ns/op 8.80x BenchmarkTanh 65.0 ns/op 15.1 ns/op 4.30x Accuracy was tested against a high precision reference function to determine maximum error. Approximately 4,000,000 points were tested for each function, producing the following result. Note: ulperr is error in "units in the last place" max ulperr sin 1.43 (returns NaN beyond +-2^50) cos 1.79 (returns NaN beyond +-2^50) cosh 1.05 sinh 3.02 tanh 3.69 log10 1.75 Also includes a set of tests to test non-vector functions even when SIMD is enabled Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc Reviewed-on: https://go-review.googlesource.com/32352 Run-TryBot: Michael Munday <munday@ca.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com>
2016-10-29 22:11:37 -06:00
WORD $0xA7386FBE //lhi %r3,28606
WFMADB V2, V6, V0, V6
SUBW R1, R3, R1
BR L17
L21:
MOVD $coshxinf<>+0(SB), R1
FMOVD 0(R1), F0
FMOVD F0, ret+8(FP)
RET