cmd/internal/gc: use hardware instruction for math.Sqrt (amd64/arm)

I first prototyped this change in Sept 2011, and I discarded it because it made no difference in the obvious benchmark loop. It still makes no difference in the obvious benchmark loop, but in a less obvious one, doing some extra computation around the calls to Sqrt, not making the call does have a significant effect. benchmark old ns/op new ns/op delta BenchmarkSqrt 4.56 4.57 +0.22% BenchmarkSqrtIndirect 4.56 4.56 +0.00% BenchmarkSqrtGo 69.4 69.4 +0.00% BenchmarkSqrtPrime 4417 3647 -17.43% This is a warmup for using hardware expansions for some calls to 1-line assembly routines in the runtime (for example getg). Change-Id: Ie66be23f8c09d0f7dc4ddd7ca8a93cfce28f55a4 Reviewed-on: https://go-review.googlesource.com/8356 Reviewed-by: Rob Pike <r@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org>
2024-11-19 12:04:43 -07:00 · 2015-04-01 16:02:34 -04:00 · 2015-04-01 16:02:34 -04:00 · 92dba0d278
commit 92dba0d278
parent 90c0fefe51
11 changed files with 91 additions and 13 deletions
--- a/src/cmd/5g/gsubr.go
+++ b/src/cmd/5g/gsubr.go
@ -1055,6 +1055,9 @@ func optoas(op int, t *gc.Type) int {

 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = arm.ADIVD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = arm.ASQRTD
 	}

 	return a
--- a/src/cmd/5g/peep.go
+++ b/src/cmd/5g/peep.go
@ -1101,6 +1101,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
 		return 0

 	case obj.ANOP, /* read,, write */
+		arm.ASQRTD,
 		arm.AMOVW,
 		arm.AMOVF,
 		arm.AMOVD,
--- a/src/cmd/5g/prog.go
+++ b/src/cmd/5g/prog.go
@ -70,16 +70,17 @@ var progtable = [arm.ALAST]obj.ProgInfo{
 	arm.ATST:    {gc.SizeL | gc.LeftRead | gc.RightRead, 0, 0, 0},

 	// Floating point.
-	arm.AADDD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AADDF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ACMPD: {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ACMPF: {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ADIVD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ADIVF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ACMPD:  {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ACMPF:  {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ADIVD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ADIVF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASQRTD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},

 	// Conversions.
 	arm.AMOVWD: {gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv, 0, 0, 0},
--- a/src/cmd/6g/gsubr.go
+++ b/src/cmd/6g/gsubr.go
@ -1131,6 +1131,9 @@ func optoas(op int, t *gc.Type) int {

 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = x86.ADIVSD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = x86.ASQRTSD
 	}

 	return a
--- a/src/cmd/6g/prog.go
+++ b/src/cmd/6g/prog.go
@ -204,6 +204,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
 	x86.ASHRL:     {gc.SizeL | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRQ:     {gc.SizeQ | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRW:     {gc.SizeW | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
+	x86.ASQRTSD:   {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 	x86.ASTOSB:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSL:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSQ:    {gc.OK, AX | DI, DI, 0},
--- a/src/cmd/internal/gc/cgen.go
+++ b/src/cmd/internal/gc/cgen.go
@ -409,6 +409,15 @@ func Cgen(n *Node, res *Node) {
 		cgen_norm(n, &n1, res)
 		return

+	case OSQRT:
+		var n1 Node
+		Regalloc(&n1, nl.Type, res)
+		Cgen(n.Left, &n1)
+		Thearch.Gins(Thearch.Optoas(OSQRT, nl.Type), &n1, &n1)
+		Thearch.Gmove(&n1, res)
+		Regfree(&n1)
+		return
+
 		// symmetric binary
 	case OAND,
 		OOR,
--- a/src/cmd/internal/gc/gen.go
+++ b/src/cmd/internal/gc/gen.go
@ -1002,6 +1002,9 @@ func gen(n *Node) {
 	case ORETURN, ORETJMP:
 		cgen_ret(n)

+	case OSQRT:
+		cgen_discard(n.Left)
+
 	case OCHECKNIL:
 		Cgen_checknil(n.Left)

--- a/src/cmd/internal/gc/syntax.go
+++ b/src/cmd/internal/gc/syntax.go
@ -293,7 +293,7 @@ const (
 	OREGISTER // a register, such as AX.
 	OINDREG   // offset plus indirect of a register, such as 8(SP).

-	// 386/amd64-specific opcodes
+	// arch-specific opcodes
 	OCMP    // compare: ACMP.
 	ODEC    // decrement: ADEC.
 	OINC    // increment: AINC.
@ -303,6 +303,7 @@ const (
 	ORROTC  // right rotate-carry: ARCR.
 	ORETJMP // return to other function
 	OPS     // compare parity set (for x86 NaN check)
+	OSQRT   // sqrt(float64), on systems that have hw support

 	OEND
 )
--- a/src/cmd/internal/gc/walk.go
+++ b/src/cmd/internal/gc/walk.go
@ -622,6 +622,16 @@ func walkexpr(np **Node, init **NodeList) {
 		walkexpr(&n.Left, init)
 		walkexprlist(n.List, init)

+		if n.Left.Op == ONAME && n.Left.Sym.Name == "Sqrt" && n.Left.Sym.Pkg.Path == "math" {
+			switch Thearch.Thechar {
+			case '5', '6':
+				n.Op = OSQRT
+				n.Left = n.List.N
+				n.List = nil
+				goto ret
+			}
+		}
+
 		ll := ascompatte(int(n.Op), n, n.Isddd, getinarg(t), n.List, 0, init)
 		n.List = reorder1(ll)
 		goto ret
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@ -2977,15 +2977,56 @@ func BenchmarkSinh(b *testing.B) {
 	}
 }

+var Global float64
+
 func BenchmarkSqrt(b *testing.B) {
+	x, y := 0.0, 10.0
 	for i := 0; i < b.N; i++ {
-		Sqrt(10)
+		x += Sqrt(y)
 	}
+	Global = x
+}
+
+func BenchmarkSqrtIndirect(b *testing.B) {
+	x, y := 0.0, 10.0
+	f := Sqrt
+	for i := 0; i < b.N; i++ {
+		x += f(y)
+	}
+	Global = x
 }

 func BenchmarkSqrtGo(b *testing.B) {
+	x, y := 0.0, 10.0
 	for i := 0; i < b.N; i++ {
-		SqrtGo(10)
+		x += SqrtGo(y)
+	}
+	Global = x
+}
+
+func isPrime(i int) bool {
+	// Yes, this is a dumb way to write this code,
+	// but calling Sqrt repeatedly in this way demonstrates
+	// the benefit of using a direct SQRT instruction on systems
+	// that have one, whereas the obvious loop seems not to
+	// demonstrate such a benefit.
+	for j := 2; float64(j) <= Sqrt(float64(i)); j++ {
+		if i%j == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func BenchmarkSqrtPrime(b *testing.B) {
+	any := false
+	for i := 0; i < b.N; i++ {
+		if isPrime(100003) {
+			any = true
+		}
+	}
+	if any {
+		Global = 1
 	}
 }

--- a/src/math/sqrt.go
+++ b/src/math/sqrt.go
@ -91,6 +91,11 @@ package math
 //	Sqrt(NaN) = NaN
 func Sqrt(x float64) float64

+// Note: Sqrt is implemented in assembly on some systems.
+// Others have assembly stubs that jump to func sqrt below.
+// On systems where Sqrt is a single instruction, the compiler
+// may turn a direct call into a direct use of that instruction instead.
+
 func sqrt(x float64) float64 {
 	// special cases
 	switch {