diff --git a/src/cmd/5g/gsubr.go b/src/cmd/5g/gsubr.go
index 0d22f74c9a..fe4ed8d1f2 100644
--- a/src/cmd/5g/gsubr.go
+++ b/src/cmd/5g/gsubr.go
@@ -1055,6 +1055,9 @@ func optoas(op int, t *gc.Type) int {
 
 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = arm.ADIVD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = arm.ASQRTD
 	}
 
 	return a
diff --git a/src/cmd/5g/peep.go b/src/cmd/5g/peep.go
index 5305e4b7f6..9ec3be2eec 100644
--- a/src/cmd/5g/peep.go
+++ b/src/cmd/5g/peep.go
@@ -1101,6 +1101,7 @@ func copyu(p *obj.Prog, v *obj.Addr, s *obj.Addr) int {
 		return 0
 
 	case obj.ANOP, /* read,, write */
+		arm.ASQRTD,
 		arm.AMOVW,
 		arm.AMOVF,
 		arm.AMOVD,
diff --git a/src/cmd/5g/prog.go b/src/cmd/5g/prog.go
index bfb703e56f..c472cdf042 100644
--- a/src/cmd/5g/prog.go
+++ b/src/cmd/5g/prog.go
@@ -70,16 +70,17 @@ var progtable = [arm.ALAST]obj.ProgInfo{
 	arm.ATST:    {gc.SizeL | gc.LeftRead | gc.RightRead, 0, 0, 0},
 
 	// Floating point.
-	arm.AADDD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AADDF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ACMPD: {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ACMPF: {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
-	arm.ADIVD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ADIVF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.AMULF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
-	arm.ASUBF: {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AADDF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ACMPD:  {gc.SizeD | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ACMPF:  {gc.SizeF | gc.LeftRead | gc.RightRead, 0, 0, 0},
+	arm.ADIVD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ADIVF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.AMULF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBD:  {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASUBF:  {gc.SizeF | gc.LeftRead | RightRdwr, 0, 0, 0},
+	arm.ASQRTD: {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 
 	// Conversions.
 	arm.AMOVWD: {gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv, 0, 0, 0},
diff --git a/src/cmd/6g/gsubr.go b/src/cmd/6g/gsubr.go
index b2290af733..323ea69a98 100644
--- a/src/cmd/6g/gsubr.go
+++ b/src/cmd/6g/gsubr.go
@@ -1131,6 +1131,9 @@ func optoas(op int, t *gc.Type) int {
 
 	case gc.ODIV<<16 | gc.TFLOAT64:
 		a = x86.ADIVSD
+
+	case gc.OSQRT<<16 | gc.TFLOAT64:
+		a = x86.ASQRTSD
 	}
 
 	return a
diff --git a/src/cmd/6g/prog.go b/src/cmd/6g/prog.go
index 0644800257..fe9f013851 100644
--- a/src/cmd/6g/prog.go
+++ b/src/cmd/6g/prog.go
@@ -204,6 +204,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
 	x86.ASHRL:     {gc.SizeL | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRQ:     {gc.SizeQ | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
 	x86.ASHRW:     {gc.SizeW | gc.LeftRead | RightRdwr | gc.ShiftCX | gc.SetCarry, 0, 0, 0},
+	x86.ASQRTSD:   {gc.SizeD | gc.LeftRead | RightRdwr, 0, 0, 0},
 	x86.ASTOSB:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSL:    {gc.OK, AX | DI, DI, 0},
 	x86.ASTOSQ:    {gc.OK, AX | DI, DI, 0},
diff --git a/src/cmd/internal/gc/cgen.go b/src/cmd/internal/gc/cgen.go
index b3524c26c4..d3921f7ece 100644
--- a/src/cmd/internal/gc/cgen.go
+++ b/src/cmd/internal/gc/cgen.go
@@ -409,6 +409,15 @@ func Cgen(n *Node, res *Node) {
 		cgen_norm(n, &n1, res)
 		return
 
+	case OSQRT:
+		var n1 Node
+		Regalloc(&n1, nl.Type, res)
+		Cgen(n.Left, &n1)
+		Thearch.Gins(Thearch.Optoas(OSQRT, nl.Type), &n1, &n1)
+		Thearch.Gmove(&n1, res)
+		Regfree(&n1)
+		return
+
 		// symmetric binary
 	case OAND,
 		OOR,
diff --git a/src/cmd/internal/gc/gen.go b/src/cmd/internal/gc/gen.go
index caae2f1ce1..e0659fc8a4 100644
--- a/src/cmd/internal/gc/gen.go
+++ b/src/cmd/internal/gc/gen.go
@@ -1002,6 +1002,9 @@ func gen(n *Node) {
 	case ORETURN, ORETJMP:
 		cgen_ret(n)
 
+	case OSQRT:
+		cgen_discard(n.Left)
+
 	case OCHECKNIL:
 		Cgen_checknil(n.Left)
 
diff --git a/src/cmd/internal/gc/syntax.go b/src/cmd/internal/gc/syntax.go
index 8f5b85db1f..671a624c1d 100644
--- a/src/cmd/internal/gc/syntax.go
+++ b/src/cmd/internal/gc/syntax.go
@@ -293,7 +293,7 @@ const (
 	OREGISTER // a register, such as AX.
 	OINDREG   // offset plus indirect of a register, such as 8(SP).
 
-	// 386/amd64-specific opcodes
+	// arch-specific opcodes
 	OCMP    // compare: ACMP.
 	ODEC    // decrement: ADEC.
 	OINC    // increment: AINC.
@@ -303,6 +303,7 @@ const (
 	ORROTC  // right rotate-carry: ARCR.
 	ORETJMP // return to other function
 	OPS     // compare parity set (for x86 NaN check)
+	OSQRT   // sqrt(float64), on systems that have hw support
 
 	OEND
 )
diff --git a/src/cmd/internal/gc/walk.go b/src/cmd/internal/gc/walk.go
index c10201aa2e..a0a29d35ac 100644
--- a/src/cmd/internal/gc/walk.go
+++ b/src/cmd/internal/gc/walk.go
@@ -622,6 +622,16 @@ func walkexpr(np **Node, init **NodeList) {
 		walkexpr(&n.Left, init)
 		walkexprlist(n.List, init)
 
+		if n.Left.Op == ONAME && n.Left.Sym.Name == "Sqrt" && n.Left.Sym.Pkg.Path == "math" {
+			switch Thearch.Thechar {
+			case '5', '6':
+				n.Op = OSQRT
+				n.Left = n.List.N
+				n.List = nil
+				goto ret
+			}
+		}
+
 		ll := ascompatte(int(n.Op), n, n.Isddd, getinarg(t), n.List, 0, init)
 		n.List = reorder1(ll)
 		goto ret
diff --git a/src/math/all_test.go b/src/math/all_test.go
index c07ac740e3..84061be264 100644
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -2977,15 +2977,56 @@ func BenchmarkSinh(b *testing.B) {
 	}
 }
 
+var Global float64
+
 func BenchmarkSqrt(b *testing.B) {
+	x, y := 0.0, 10.0
 	for i := 0; i < b.N; i++ {
-		Sqrt(10)
+		x += Sqrt(y)
 	}
+	Global = x
+}
+
+func BenchmarkSqrtIndirect(b *testing.B) {
+	x, y := 0.0, 10.0
+	f := Sqrt
+	for i := 0; i < b.N; i++ {
+		x += f(y)
+	}
+	Global = x
 }
 
 func BenchmarkSqrtGo(b *testing.B) {
+	x, y := 0.0, 10.0
 	for i := 0; i < b.N; i++ {
-		SqrtGo(10)
+		x += SqrtGo(y)
+	}
+	Global = x
+}
+
+func isPrime(i int) bool {
+	// Yes, this is a dumb way to write this code,
+	// but calling Sqrt repeatedly in this way demonstrates
+	// the benefit of using a direct SQRT instruction on systems
+	// that have one, whereas the obvious loop seems not to
+	// demonstrate such a benefit.
+	for j := 2; float64(j) <= Sqrt(float64(i)); j++ {
+		if i%j == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func BenchmarkSqrtPrime(b *testing.B) {
+	any := false
+	for i := 0; i < b.N; i++ {
+		if isPrime(100003) {
+			any = true
+		}
+	}
+	if any {
+		Global = 1
 	}
 }
 
diff --git a/src/math/sqrt.go b/src/math/sqrt.go
index fdc869992e..23cf2996c2 100644
--- a/src/math/sqrt.go
+++ b/src/math/sqrt.go
@@ -91,6 +91,11 @@ package math
 //	Sqrt(NaN) = NaN
 func Sqrt(x float64) float64
 
+// Note: Sqrt is implemented in assembly on some systems.
+// Others have assembly stubs that jump to func sqrt below.
+// On systems where Sqrt is a single instruction, the compiler
+// may turn a direct call into a direct use of that instruction instead.
+
 func sqrt(x float64) float64 {
 	// special cases
 	switch {