From 2a4158207edb499f8b210aaa7a9af103b93b5ac7 Mon Sep 17 00:00:00 2001
From: Michael Munday <munday@ca.ibm.com>
Date: Sun, 10 Apr 2016 21:58:37 -0400
Subject: [PATCH] cmd/compile/internal/gc: refactor cgen_div

This commit adds two new functions to cgen.go: hasHMUL64 and
hasRROTC64. These are used to determine whether or not an
architecture supports the instructions needed to perform an
optimization in cgen_div.

This commit should not affect existing architectures (although it
does add s390x to the new functions). However, since most
architectures support HMUL the hasHMUL64 function could be
modified to enable most of the optimizations in cgen_div on those
platforms.

Change-Id: I33bf329ddeb6cf2954bd17b7c161012de352fb62
Reviewed-on: https://go-review.googlesource.com/21775
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Run-TryBot: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/cmd/compile/internal/gc/cgen.go | 68 +++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 19 deletions(-)

diff --git a/src/cmd/compile/internal/gc/cgen.go b/src/cmd/compile/internal/gc/cgen.go
index a9cedf7cfc2..eacbc30f87c 100644
--- a/src/cmd/compile/internal/gc/cgen.go
+++ b/src/cmd/compile/internal/gc/cgen.go
@@ -2622,24 +2622,48 @@ func cgen_ret(n *Node) {
 	}
 }
 
+// hasHMUL64 reports whether the architecture supports 64-bit
+// signed and unsigned high multiplication (OHMUL).
+func hasHMUL64() bool {
+	switch Ctxt.Arch.Family {
+	case sys.AMD64, sys.S390X:
+		return true
+	case sys.ARM, sys.ARM64, sys.I386, sys.MIPS64, sys.PPC64:
+		return false
+	}
+	Fatalf("unknown architecture")
+	return false
+}
+
+// hasRROTC64 reports whether the architecture supports 64-bit
+// rotate through carry instructions (ORROTC).
+func hasRROTC64() bool {
+	switch Ctxt.Arch.Family {
+	case sys.AMD64:
+		return true
+	case sys.ARM, sys.ARM64, sys.I386, sys.MIPS64, sys.PPC64, sys.S390X:
+		return false
+	}
+	Fatalf("unknown architecture")
+	return false
+}
+
 // generate division according to op, one of:
 //	res = nl / nr
 //	res = nl % nr
 func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 	var w int
 
-	// TODO(rsc): arm64 needs to support the relevant instructions
-	// in peep and optoas in order to enable this.
-	// TODO(rsc): ppc64 needs to support the relevant instructions
-	// in peep and optoas in order to enable this.
-	if nr.Op != OLITERAL || Ctxt.Arch.Family == sys.MIPS64 || Ctxt.Arch.Family == sys.ARM64 || Ctxt.Arch.Family == sys.PPC64 {
+	// Architectures need to support 64-bit high multiplications
+	// (OHMUL) in order to perform divide by constant optimizations.
+	if nr.Op != OLITERAL || !hasHMUL64() {
 		goto longdiv
 	}
 	w = int(nl.Type.Width * 8)
 
 	// Front end handled 32-bit division. We only need to handle 64-bit.
-	// try to do division by multiply by (2^w)/d
-	// see hacker's delight chapter 10
+	// Try to do division using multiplication: (2^w)/d.
+	// See Hacker's Delight, chapter 10.
 	switch Simtype[nl.Type.Etype] {
 	default:
 		goto longdiv
@@ -2652,6 +2676,17 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		if m.Bad != 0 {
 			break
 		}
+
+		// In order to add the numerator we need to be able to
+		// avoid overflow. This is done by shifting the result of the
+		// addition right by 1 and inserting the carry bit into
+		// the MSB. For now this needs the RROTC instruction.
+		// TODO(mundaym): Hacker's Delight 2nd ed. chapter 10 proposes
+		// an alternative sequence of instructions for architectures
+		// that do not have a shift right with carry instruction.
+		if m.Ua != 0 && !hasRROTC64() {
+			goto longdiv
+		}
 		if op == OMOD {
 			goto longmod
 		}
@@ -2665,7 +2700,7 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		Thearch.Cgen_hmul(&n1, &n2, &n3)
 
 		if m.Ua != 0 {
-			// need to add numerator accounting for overflow
+			// Need to add numerator accounting for overflow.
 			Thearch.Gins(Thearch.Optoas(OADD, nl.Type), &n1, &n3)
 
 			Nodconst(&n2, nl.Type, 1)
@@ -2703,7 +2738,7 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		Thearch.Cgen_hmul(&n1, &n2, &n3)
 
 		if m.Sm < 0 {
-			// need to add numerator
+			// Need to add numerator (cannot overflow).
 			Thearch.Gins(Thearch.Optoas(OADD, nl.Type), &n1, &n3)
 		}
 
@@ -2716,8 +2751,8 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		Thearch.Gins(Thearch.Optoas(OSUB, nl.Type), &n1, &n3) // added
 
 		if m.Sd < 0 {
-			// this could probably be removed
-			// by factoring it into the multiplier
+			// This could probably be removed by factoring it into
+			// the multiplier.
 			Thearch.Gins(Thearch.Optoas(OMINUS, nl.Type), nil, &n3)
 		}
 
@@ -2729,14 +2764,14 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 
 	goto longdiv
 
-	// division and mod using (slow) hardware instruction
+	// Division and mod using (slow) hardware instruction.
 longdiv:
 	Thearch.Dodiv(op, nl, nr, res)
 
 	return
 
-	// mod using formula A%B = A-(A/B*B) but
-	// we know that there is a fast algorithm for A/B
+	// Mod using formula A%B = A-(A/B*B) but
+	// we know that there is a fast algorithm for A/B.
 longmod:
 	var n1 Node
 	Regalloc(&n1, nl.Type, res)
@@ -2746,11 +2781,6 @@ longmod:
 	Regalloc(&n2, nl.Type, nil)
 	cgen_div(ODIV, &n1, nr, &n2)
 	a := Thearch.Optoas(OMUL, nl.Type)
-	if w == 8 {
-		// use 2-operand 16-bit multiply
-		// because there is no 2-operand 8-bit multiply
-		a = Thearch.Optoas(OMUL, Types[TINT16]) // XXX was IMULW
-	}
 
 	if !Smallintconst(nr) {
 		var n3 Node