mirror of
https://github.com/golang/go
synced 2024-11-26 22:51:23 -07:00
1d20a362d0
Currently almost all math functions have the following pattern: func Sin(x float64) float64 func sin(x float64) float64 { // ... pure Go implementation ... } Architectures that implement a function in assembly provide the assembly implementation directly as the exported function (e.g., Sin), and architectures that don't implement it in assembly use a small stub to jump back to the Go code, like: TEXT ·Sin(SB), NOSPLIT, $0 JMP ·sin(SB) However, most functions are not implemented in assembly on most architectures, so this jump through assembly is a waste. It defeats compiler optimizations like inlining. And, with regabi, it actually adds a small but non-trivial overhead because the jump from assembly back to Go must go through an ABI0->ABIInternal bridge function. Hence, this CL reorganizes this structure across the entire package. It now leans on inlining to achieve peak performance, but allows the compiler to see all the way through the pure Go implementation. Now, functions follow this pattern: func Sin(x float64) float64 { if haveArchSin { return archSin(x) } return sin(x) } func sin(x float64) float64 { // ... pure Go implementation ... } Architectures that have assembly implementations use build-tagged files to set haveArchX to true an provide an archX implementation. That implementation can also still call back into the Go implementation (some of them do this). Prior to this change, enabling ABI wrappers results in a geomean slowdown of the math benchmarks of 8.77% (full results: https://perf.golang.org/search?q=upload:20210415.6) and of the Tile38 benchmarks by ~4%. After this change, enabling ABI wrappers is completely performance-neutral on Tile38 and all but one math benchmark (full results: https://perf.golang.org/search?q=upload:20210415.7). ABI wrappers slow down SqrtIndirectLatency-12 by 2.09%, which makes sense because that call must still go through an ABI wrapper. With ABI wrappers disabled (which won't be an option on amd64 much longer), on linux/amd64, this change is largely performance-neutral and slightly improves the performance of a few benchmarks: (Because there are so many benchmarks, I've applied the Šidák correction to the alpha threshold. It makes relatively little difference in which benchmarks are statistically significant.) name old time/op new time/op delta Acos-12 22.3ns ± 0% 18.8ns ± 1% -15.44% (p=0.000 n=18+16) Acosh-12 28.2ns ± 0% 28.2ns ± 0% ~ (p=0.404 n=18+20) Asin-12 18.1ns ± 0% 18.2ns ± 0% +0.20% (p=0.000 n=18+16) Asinh-12 32.8ns ± 0% 32.9ns ± 1% ~ (p=0.891 n=18+20) Atan-12 9.92ns ± 0% 9.90ns ± 1% -0.24% (p=0.000 n=17+16) Atanh-12 27.7ns ± 0% 27.5ns ± 0% -0.72% (p=0.000 n=16+20) Atan2-12 18.5ns ± 0% 18.4ns ± 0% -0.59% (p=0.000 n=19+19) Cbrt-12 22.1ns ± 0% 22.1ns ± 0% ~ (p=0.804 n=16+17) Ceil-12 0.84ns ± 0% 0.84ns ± 0% ~ (p=0.663 n=18+16) Copysign-12 0.84ns ± 0% 0.84ns ± 0% ~ (p=0.762 n=16+19) Cos-12 12.7ns ± 0% 12.7ns ± 1% ~ (p=0.145 n=19+18) Cosh-12 22.2ns ± 0% 22.5ns ± 0% +1.60% (p=0.000 n=17+19) Erf-12 11.1ns ± 1% 11.1ns ± 1% ~ (p=0.010 n=19+19) Erfc-12 12.6ns ± 1% 12.7ns ± 0% ~ (p=0.066 n=19+15) Erfinv-12 16.1ns ± 0% 16.1ns ± 0% ~ (p=0.462 n=17+20) Erfcinv-12 16.0ns ± 1% 16.0ns ± 1% ~ (p=0.015 n=17+16) Exp-12 16.3ns ± 0% 16.5ns ± 1% +1.25% (p=0.000 n=19+16) ExpGo-12 36.2ns ± 1% 36.1ns ± 1% ~ (p=0.242 n=20+18) Expm1-12 18.6ns ± 0% 18.7ns ± 0% +0.25% (p=0.000 n=16+19) Exp2-12 34.7ns ± 0% 34.6ns ± 1% ~ (p=0.010 n=19+18) Exp2Go-12 34.8ns ± 1% 34.8ns ± 1% ~ (p=0.372 n=19+19) Abs-12 0.56ns ± 0% 0.56ns ± 0% ~ (p=0.766 n=18+16) Dim-12 0.84ns ± 1% 0.84ns ± 1% ~ (p=0.167 n=17+19) Floor-12 0.84ns ± 0% 0.84ns ± 0% ~ (p=0.993 n=18+16) Max-12 3.35ns ± 0% 3.35ns ± 0% ~ (p=0.894 n=17+19) Min-12 3.35ns ± 0% 3.36ns ± 1% ~ (p=0.214 n=18+18) Mod-12 35.2ns ± 0% 34.7ns ± 0% -1.45% (p=0.000 n=18+17) Frexp-12 5.31ns ± 0% 4.75ns ± 0% -10.51% (p=0.000 n=19+18) Gamma-12 14.8ns ± 0% 16.2ns ± 1% +9.21% (p=0.000 n=20+19) Hypot-12 6.16ns ± 0% 6.17ns ± 0% +0.26% (p=0.000 n=19+20) HypotGo-12 7.79ns ± 1% 7.78ns ± 0% ~ (p=0.497 n=18+17) Ilogb-12 4.47ns ± 0% 4.47ns ± 0% ~ (p=0.167 n=19+19) J0-12 76.0ns ± 0% 76.3ns ± 0% +0.35% (p=0.000 n=19+18) J1-12 76.8ns ± 1% 75.9ns ± 0% -1.14% (p=0.000 n=18+18) Jn-12 167ns ± 1% 168ns ± 1% ~ (p=0.038 n=18+18) Ldexp-12 6.98ns ± 0% 6.43ns ± 0% -7.97% (p=0.000 n=17+18) Lgamma-12 15.9ns ± 0% 16.0ns ± 1% ~ (p=0.011 n=20+17) Log-12 13.3ns ± 0% 13.4ns ± 1% +0.37% (p=0.000 n=15+18) Logb-12 4.75ns ± 0% 4.75ns ± 0% ~ (p=0.831 n=16+18) Log1p-12 19.5ns ± 0% 19.5ns ± 1% ~ (p=0.851 n=18+17) Log10-12 15.9ns ± 0% 14.0ns ± 0% -11.92% (p=0.000 n=17+16) Log2-12 7.88ns ± 1% 8.01ns ± 0% +1.72% (p=0.000 n=20+20) Modf-12 4.75ns ± 0% 4.34ns ± 0% -8.66% (p=0.000 n=19+17) Nextafter32-12 5.31ns ± 0% 5.31ns ± 0% ~ (p=0.389 n=17+18) Nextafter64-12 5.03ns ± 1% 5.03ns ± 0% ~ (p=0.774 n=17+18) PowInt-12 29.9ns ± 0% 28.5ns ± 0% -4.69% (p=0.000 n=18+19) PowFrac-12 91.0ns ± 0% 91.1ns ± 0% ~ (p=0.029 n=19+19) Pow10Pos-12 1.12ns ± 0% 1.12ns ± 0% ~ (p=0.363 n=20+20) Pow10Neg-12 3.90ns ± 0% 3.90ns ± 0% ~ (p=0.921 n=17+18) Round-12 2.31ns ± 0% 2.31ns ± 1% ~ (p=0.390 n=18+18) RoundToEven-12 0.84ns ± 0% 0.84ns ± 0% ~ (p=0.280 n=18+19) Remainder-12 31.6ns ± 0% 29.6ns ± 0% -6.16% (p=0.000 n=18+17) Signbit-12 0.56ns ± 0% 0.56ns ± 0% ~ (p=0.385 n=19+18) Sin-12 12.5ns ± 0% 12.5ns ± 0% ~ (p=0.080 n=18+18) Sincos-12 16.4ns ± 2% 16.4ns ± 2% ~ (p=0.253 n=20+19) Sinh-12 26.1ns ± 0% 26.1ns ± 0% +0.18% (p=0.000 n=17+19) SqrtIndirect-12 3.91ns ± 0% 3.90ns ± 0% ~ (p=0.133 n=19+19) SqrtLatency-12 2.79ns ± 0% 2.79ns ± 0% ~ (p=0.226 n=16+19) SqrtIndirectLatency-12 6.68ns ± 0% 6.37ns ± 2% -4.66% (p=0.000 n=17+20) SqrtGoLatency-12 49.4ns ± 0% 49.4ns ± 0% ~ (p=0.289 n=18+16) SqrtPrime-12 3.18µs ± 0% 3.18µs ± 0% ~ (p=0.084 n=17+18) Tan-12 13.8ns ± 0% 13.9ns ± 2% ~ (p=0.292 n=19+20) Tanh-12 25.4ns ± 0% 25.4ns ± 0% ~ (p=0.101 n=17+17) Trunc-12 0.84ns ± 0% 0.84ns ± 0% ~ (p=0.765 n=18+16) Y0-12 75.8ns ± 0% 75.9ns ± 1% ~ (p=0.805 n=16+18) Y1-12 76.3ns ± 0% 75.3ns ± 1% -1.34% (p=0.000 n=19+17) Yn-12 164ns ± 0% 164ns ± 2% ~ (p=0.356 n=18+20) Float64bits-12 0.56ns ± 0% 0.56ns ± 0% ~ (p=0.383 n=18+18) Float64frombits-12 0.56ns ± 0% 0.56ns ± 0% ~ (p=0.066 n=18+19) Float32bits-12 0.56ns ± 0% 0.56ns ± 0% ~ (p=0.889 n=16+19) Float32frombits-12 0.56ns ± 0% 0.56ns ± 0% ~ (p=0.007 n=18+19) FMA-12 23.9ns ± 0% 24.0ns ± 0% +0.31% (p=0.000 n=16+17) [Geo mean] 9.86ns 9.77ns -0.87% (https://perf.golang.org/search?q=upload:20210415.5) For #40724. Change-Id: I44fbba2a17be930ec9daeb0a8222f55cd50555a0 Reviewed-on: https://go-review.googlesource.com/c/go/+/310331 Trust: Austin Clements <austin@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
202 lines
5.4 KiB
Go
202 lines
5.4 KiB
Go
// Copyright 2009 The Go Authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style
|
||
// license that can be found in the LICENSE file.
|
||
|
||
package math
|
||
|
||
// Exp returns e**x, the base-e exponential of x.
|
||
//
|
||
// Special cases are:
|
||
// Exp(+Inf) = +Inf
|
||
// Exp(NaN) = NaN
|
||
// Very large values overflow to 0 or +Inf.
|
||
// Very small values underflow to 1.
|
||
func Exp(x float64) float64 {
|
||
if haveArchExp {
|
||
return archExp(x)
|
||
}
|
||
return exp(x)
|
||
}
|
||
|
||
// The original C code, the long comment, and the constants
|
||
// below are from FreeBSD's /usr/src/lib/msun/src/e_exp.c
|
||
// and came with this notice. The go code is a simplified
|
||
// version of the original C.
|
||
//
|
||
// ====================================================
|
||
// Copyright (C) 2004 by Sun Microsystems, Inc. All rights reserved.
|
||
//
|
||
// Permission to use, copy, modify, and distribute this
|
||
// software is freely granted, provided that this notice
|
||
// is preserved.
|
||
// ====================================================
|
||
//
|
||
//
|
||
// exp(x)
|
||
// Returns the exponential of x.
|
||
//
|
||
// Method
|
||
// 1. Argument reduction:
|
||
// Reduce x to an r so that |r| <= 0.5*ln2 ~ 0.34658.
|
||
// Given x, find r and integer k such that
|
||
//
|
||
// x = k*ln2 + r, |r| <= 0.5*ln2.
|
||
//
|
||
// Here r will be represented as r = hi-lo for better
|
||
// accuracy.
|
||
//
|
||
// 2. Approximation of exp(r) by a special rational function on
|
||
// the interval [0,0.34658]:
|
||
// Write
|
||
// R(r**2) = r*(exp(r)+1)/(exp(r)-1) = 2 + r*r/6 - r**4/360 + ...
|
||
// We use a special Remez algorithm on [0,0.34658] to generate
|
||
// a polynomial of degree 5 to approximate R. The maximum error
|
||
// of this polynomial approximation is bounded by 2**-59. In
|
||
// other words,
|
||
// R(z) ~ 2.0 + P1*z + P2*z**2 + P3*z**3 + P4*z**4 + P5*z**5
|
||
// (where z=r*r, and the values of P1 to P5 are listed below)
|
||
// and
|
||
// | 5 | -59
|
||
// | 2.0+P1*z+...+P5*z - R(z) | <= 2
|
||
// | |
|
||
// The computation of exp(r) thus becomes
|
||
// 2*r
|
||
// exp(r) = 1 + -------
|
||
// R - r
|
||
// r*R1(r)
|
||
// = 1 + r + ----------- (for better accuracy)
|
||
// 2 - R1(r)
|
||
// where
|
||
// 2 4 10
|
||
// R1(r) = r - (P1*r + P2*r + ... + P5*r ).
|
||
//
|
||
// 3. Scale back to obtain exp(x):
|
||
// From step 1, we have
|
||
// exp(x) = 2**k * exp(r)
|
||
//
|
||
// Special cases:
|
||
// exp(INF) is INF, exp(NaN) is NaN;
|
||
// exp(-INF) is 0, and
|
||
// for finite argument, only exp(0)=1 is exact.
|
||
//
|
||
// Accuracy:
|
||
// according to an error analysis, the error is always less than
|
||
// 1 ulp (unit in the last place).
|
||
//
|
||
// Misc. info.
|
||
// For IEEE double
|
||
// if x > 7.09782712893383973096e+02 then exp(x) overflow
|
||
// if x < -7.45133219101941108420e+02 then exp(x) underflow
|
||
//
|
||
// Constants:
|
||
// The hexadecimal values are the intended ones for the following
|
||
// constants. The decimal values may be used, provided that the
|
||
// compiler will convert from decimal to binary accurately enough
|
||
// to produce the hexadecimal values shown.
|
||
|
||
func exp(x float64) float64 {
|
||
const (
|
||
Ln2Hi = 6.93147180369123816490e-01
|
||
Ln2Lo = 1.90821492927058770002e-10
|
||
Log2e = 1.44269504088896338700e+00
|
||
|
||
Overflow = 7.09782712893383973096e+02
|
||
Underflow = -7.45133219101941108420e+02
|
||
NearZero = 1.0 / (1 << 28) // 2**-28
|
||
)
|
||
|
||
// special cases
|
||
switch {
|
||
case IsNaN(x) || IsInf(x, 1):
|
||
return x
|
||
case IsInf(x, -1):
|
||
return 0
|
||
case x > Overflow:
|
||
return Inf(1)
|
||
case x < Underflow:
|
||
return 0
|
||
case -NearZero < x && x < NearZero:
|
||
return 1 + x
|
||
}
|
||
|
||
// reduce; computed as r = hi - lo for extra precision.
|
||
var k int
|
||
switch {
|
||
case x < 0:
|
||
k = int(Log2e*x - 0.5)
|
||
case x > 0:
|
||
k = int(Log2e*x + 0.5)
|
||
}
|
||
hi := x - float64(k)*Ln2Hi
|
||
lo := float64(k) * Ln2Lo
|
||
|
||
// compute
|
||
return expmulti(hi, lo, k)
|
||
}
|
||
|
||
// Exp2 returns 2**x, the base-2 exponential of x.
|
||
//
|
||
// Special cases are the same as Exp.
|
||
func Exp2(x float64) float64 {
|
||
if haveArchExp2 {
|
||
return archExp2(x)
|
||
}
|
||
return exp2(x)
|
||
}
|
||
|
||
func exp2(x float64) float64 {
|
||
const (
|
||
Ln2Hi = 6.93147180369123816490e-01
|
||
Ln2Lo = 1.90821492927058770002e-10
|
||
|
||
Overflow = 1.0239999999999999e+03
|
||
Underflow = -1.0740e+03
|
||
)
|
||
|
||
// special cases
|
||
switch {
|
||
case IsNaN(x) || IsInf(x, 1):
|
||
return x
|
||
case IsInf(x, -1):
|
||
return 0
|
||
case x > Overflow:
|
||
return Inf(1)
|
||
case x < Underflow:
|
||
return 0
|
||
}
|
||
|
||
// argument reduction; x = r×lg(e) + k with |r| ≤ ln(2)/2.
|
||
// computed as r = hi - lo for extra precision.
|
||
var k int
|
||
switch {
|
||
case x > 0:
|
||
k = int(x + 0.5)
|
||
case x < 0:
|
||
k = int(x - 0.5)
|
||
}
|
||
t := x - float64(k)
|
||
hi := t * Ln2Hi
|
||
lo := -t * Ln2Lo
|
||
|
||
// compute
|
||
return expmulti(hi, lo, k)
|
||
}
|
||
|
||
// exp1 returns e**r × 2**k where r = hi - lo and |r| ≤ ln(2)/2.
|
||
func expmulti(hi, lo float64, k int) float64 {
|
||
const (
|
||
P1 = 1.66666666666666657415e-01 /* 0x3FC55555; 0x55555555 */
|
||
P2 = -2.77777777770155933842e-03 /* 0xBF66C16C; 0x16BEBD93 */
|
||
P3 = 6.61375632143793436117e-05 /* 0x3F11566A; 0xAF25DE2C */
|
||
P4 = -1.65339022054652515390e-06 /* 0xBEBBBD41; 0xC5D26BF1 */
|
||
P5 = 4.13813679705723846039e-08 /* 0x3E663769; 0x72BEA4D0 */
|
||
)
|
||
|
||
r := hi - lo
|
||
t := r * r
|
||
c := r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))))
|
||
y := 1 - ((lo - (r*c)/(2-c)) - hi)
|
||
// TODO(rsc): make sure Ldexp can handle boundary k
|
||
return Ldexp(y, k)
|
||
}
|