diff --git a/src/cmd/compile/internal/gc/cgen.go b/src/cmd/compile/internal/gc/cgen.go
index a9cedf7cfc2..eacbc30f87c 100644
--- a/src/cmd/compile/internal/gc/cgen.go
+++ b/src/cmd/compile/internal/gc/cgen.go
@@ -2622,24 +2622,48 @@ func cgen_ret(n *Node) {
 	}
 }
 
+// hasHMUL64 reports whether the architecture supports 64-bit
+// signed and unsigned high multiplication (OHMUL).
+func hasHMUL64() bool {
+	switch Ctxt.Arch.Family {
+	case sys.AMD64, sys.S390X:
+		return true
+	case sys.ARM, sys.ARM64, sys.I386, sys.MIPS64, sys.PPC64:
+		return false
+	}
+	Fatalf("unknown architecture")
+	return false
+}
+
+// hasRROTC64 reports whether the architecture supports 64-bit
+// rotate through carry instructions (ORROTC).
+func hasRROTC64() bool {
+	switch Ctxt.Arch.Family {
+	case sys.AMD64:
+		return true
+	case sys.ARM, sys.ARM64, sys.I386, sys.MIPS64, sys.PPC64, sys.S390X:
+		return false
+	}
+	Fatalf("unknown architecture")
+	return false
+}
+
 // generate division according to op, one of:
 //	res = nl / nr
 //	res = nl % nr
 func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 	var w int
 
-	// TODO(rsc): arm64 needs to support the relevant instructions
-	// in peep and optoas in order to enable this.
-	// TODO(rsc): ppc64 needs to support the relevant instructions
-	// in peep and optoas in order to enable this.
-	if nr.Op != OLITERAL || Ctxt.Arch.Family == sys.MIPS64 || Ctxt.Arch.Family == sys.ARM64 || Ctxt.Arch.Family == sys.PPC64 {
+	// Architectures need to support 64-bit high multiplications
+	// (OHMUL) in order to perform divide by constant optimizations.
+	if nr.Op != OLITERAL || !hasHMUL64() {
 		goto longdiv
 	}
 	w = int(nl.Type.Width * 8)
 
 	// Front end handled 32-bit division. We only need to handle 64-bit.
-	// try to do division by multiply by (2^w)/d
-	// see hacker's delight chapter 10
+	// Try to do division using multiplication: (2^w)/d.
+	// See Hacker's Delight, chapter 10.
 	switch Simtype[nl.Type.Etype] {
 	default:
 		goto longdiv
@@ -2652,6 +2676,17 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		if m.Bad != 0 {
 			break
 		}
+
+		// In order to add the numerator we need to be able to
+		// avoid overflow. This is done by shifting the result of the
+		// addition right by 1 and inserting the carry bit into
+		// the MSB. For now this needs the RROTC instruction.
+		// TODO(mundaym): Hacker's Delight 2nd ed. chapter 10 proposes
+		// an alternative sequence of instructions for architectures
+		// that do not have a shift right with carry instruction.
+		if m.Ua != 0 && !hasRROTC64() {
+			goto longdiv
+		}
 		if op == OMOD {
 			goto longmod
 		}
@@ -2665,7 +2700,7 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		Thearch.Cgen_hmul(&n1, &n2, &n3)
 
 		if m.Ua != 0 {
-			// need to add numerator accounting for overflow
+			// Need to add numerator accounting for overflow.
 			Thearch.Gins(Thearch.Optoas(OADD, nl.Type), &n1, &n3)
 
 			Nodconst(&n2, nl.Type, 1)
@@ -2703,7 +2738,7 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		Thearch.Cgen_hmul(&n1, &n2, &n3)
 
 		if m.Sm < 0 {
-			// need to add numerator
+			// Need to add numerator (cannot overflow).
 			Thearch.Gins(Thearch.Optoas(OADD, nl.Type), &n1, &n3)
 		}
 
@@ -2716,8 +2751,8 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 		Thearch.Gins(Thearch.Optoas(OSUB, nl.Type), &n1, &n3) // added
 
 		if m.Sd < 0 {
-			// this could probably be removed
-			// by factoring it into the multiplier
+			// This could probably be removed by factoring it into
+			// the multiplier.
 			Thearch.Gins(Thearch.Optoas(OMINUS, nl.Type), nil, &n3)
 		}
 
@@ -2729,14 +2764,14 @@ func cgen_div(op Op, nl *Node, nr *Node, res *Node) {
 
 	goto longdiv
 
-	// division and mod using (slow) hardware instruction
+	// Division and mod using (slow) hardware instruction.
 longdiv:
 	Thearch.Dodiv(op, nl, nr, res)
 
 	return
 
-	// mod using formula A%B = A-(A/B*B) but
-	// we know that there is a fast algorithm for A/B
+	// Mod using formula A%B = A-(A/B*B) but
+	// we know that there is a fast algorithm for A/B.
 longmod:
 	var n1 Node
 	Regalloc(&n1, nl.Type, res)
@@ -2746,11 +2781,6 @@ longmod:
 	Regalloc(&n2, nl.Type, nil)
 	cgen_div(ODIV, &n1, nr, &n2)
 	a := Thearch.Optoas(OMUL, nl.Type)
-	if w == 8 {
-		// use 2-operand 16-bit multiply
-		// because there is no 2-operand 8-bit multiply
-		a = Thearch.Optoas(OMUL, Types[TINT16]) // XXX was IMULW
-	}
 
 	if !Smallintconst(nr) {
 		var n3 Node