cmd/compile: optimize integer-in-range checks

This CL incorporates code from CL 201206 by Josh Bleecher Snyder
(thanks Josh).

This CL restores the integer-in-range optimizations in the SSA
backend. The fuse pass is enhanced to detect inequalities that
could be merged and fuse their associated blocks while the generic
rules optimize them into a single unsigned comparison.

For example, the inequality `x >= 0 && x < 10` will now be optimized
to `unsigned(x) < 10`.

Overall this has a fairly positive impact on binary sizes.

name old time/op new time/op delta
Template 192ms ± 1% 192ms ± 1% ~ (p=0.757 n=17+18)
Unicode 76.6ms ± 2% 76.5ms ± 2% ~ (p=0.603 n=19+19)
GoTypes 694ms ± 1% 693ms ± 1% ~ (p=0.569 n=19+20)
Compiler 3.26s ± 0% 3.27s ± 0% +0.25% (p=0.000 n=20+20)
SSA 7.41s ± 0% 7.49s ± 0% +1.10% (p=0.000 n=17+19)
Flate 120ms ± 1% 120ms ± 1% +0.38% (p=0.003 n=19+19)
GoParser 152ms ± 1% 152ms ± 1% ~ (p=0.061 n=17+19)
Reflect 422ms ± 1% 425ms ± 2% +0.76% (p=0.001 n=18+20)
Tar 167ms ± 1% 167ms ± 0% ~ (p=0.730 n=18+19)
XML 233ms ± 4% 231ms ± 1% ~ (p=0.752 n=20+17)
LinkCompiler 927ms ± 8% 928ms ± 8% ~ (p=0.857 n=19+20)
ExternalLinkCompiler 1.81s ± 2% 1.81s ± 2% ~ (p=0.513 n=19+20)
LinkWithoutDebugCompiler 556ms ±10% 583ms ±13% +4.95% (p=0.007 n=20+20)
[Geo mean] 478ms 481ms +0.52%

name old user-time/op new user-time/op delta
Template 270ms ± 5% 269ms ± 7% ~ (p=0.925 n=20+20)
Unicode 134ms ± 7% 131ms ±14% ~ (p=0.593 n=18+20)
GoTypes 981ms ± 3% 987ms ± 2% +0.63% (p=0.049 n=19+18)
Compiler 4.50s ± 2% 4.50s ± 1% ~ (p=0.588 n=19+20)
SSA 10.6s ± 2% 10.6s ± 1% ~ (p=0.141 n=20+19)
Flate 164ms ± 8% 165ms ±10% ~ (p=0.738 n=20+20)
GoParser 202ms ± 5% 203ms ± 6% ~ (p=0.820 n=20+20)
Reflect 587ms ± 6% 597ms ± 3% ~ (p=0.087 n=20+18)
Tar 230ms ± 6% 228ms ± 8% ~ (p=0.569 n=19+20)
XML 311ms ± 6% 314ms ± 5% ~ (p=0.369 n=20+20)
LinkCompiler 878ms ± 8% 887ms ± 7% ~ (p=0.289 n=20+20)
ExternalLinkCompiler 1.60s ± 7% 1.60s ± 7% ~ (p=0.820 n=20+20)
LinkWithoutDebugCompiler 498ms ±12% 489ms ±11% ~ (p=0.398 n=20+20)
[Geo mean] 611ms 611ms +0.05%

name old alloc/op new alloc/op delta
Template 36.1MB ± 0% 36.0MB ± 0% -0.32% (p=0.000 n=20+20)
Unicode 28.3MB ± 0% 28.3MB ± 0% -0.03% (p=0.000 n=19+20)
GoTypes 121MB ± 0% 121MB ± 0% ~ (p=0.226 n=16+20)
Compiler 563MB ± 0% 563MB ± 0% ~ (p=0.166 n=20+19)
SSA 1.32GB ± 0% 1.33GB ± 0% +0.88% (p=0.000 n=20+19)
Flate 22.7MB ± 0% 22.7MB ± 0% -0.02% (p=0.033 n=19+20)
GoParser 27.9MB ± 0% 27.9MB ± 0% -0.02% (p=0.001 n=20+20)
Reflect 78.3MB ± 0% 78.2MB ± 0% -0.01% (p=0.019 n=20+20)
Tar 34.0MB ± 0% 34.0MB ± 0% -0.04% (p=0.000 n=20+20)
XML 43.9MB ± 0% 43.9MB ± 0% -0.07% (p=0.000 n=20+19)
LinkCompiler 205MB ± 0% 205MB ± 0% +0.44% (p=0.000 n=20+18)
ExternalLinkCompiler 223MB ± 0% 223MB ± 0% +0.03% (p=0.000 n=20+20)
LinkWithoutDebugCompiler 139MB ± 0% 142MB ± 0% +1.75% (p=0.000 n=20+20)
[Geo mean] 93.7MB 93.9MB +0.20%

name old allocs/op new allocs/op delta
Template 363k ± 0% 361k ± 0% -0.58% (p=0.000 n=20+19)
Unicode 329k ± 0% 329k ± 0% -0.06% (p=0.000 n=19+20)
GoTypes 1.28M ± 0% 1.28M ± 0% -0.01% (p=0.000 n=20+20)
Compiler 5.40M ± 0% 5.40M ± 0% -0.01% (p=0.000 n=20+20)
SSA 12.7M ± 0% 12.8M ± 0% +0.80% (p=0.000 n=20+20)
Flate 228k ± 0% 228k ± 0% ~ (p=0.194 n=20+20)
GoParser 295k ± 0% 295k ± 0% -0.04% (p=0.000 n=20+20)
Reflect 949k ± 0% 949k ± 0% -0.01% (p=0.000 n=20+20)
Tar 337k ± 0% 337k ± 0% -0.06% (p=0.000 n=20+20)
XML 418k ± 0% 417k ± 0% -0.17% (p=0.000 n=20+20)
LinkCompiler 553k ± 0% 554k ± 0% +0.22% (p=0.000 n=20+19)
ExternalLinkCompiler 1.52M ± 0% 1.52M ± 0% +0.27% (p=0.000 n=20+20)
LinkWithoutDebugCompiler 186k ± 0% 186k ± 0% +0.06% (p=0.000 n=20+20)
[Geo mean] 723k 723k +0.03%

name old text-bytes new text-bytes delta
HelloSize 828kB ± 0% 828kB ± 0% -0.01% (p=0.000 n=20+20)

name old data-bytes new data-bytes delta
HelloSize 13.4kB ± 0% 13.4kB ± 0% ~ (all equal)

name old bss-bytes new bss-bytes delta
HelloSize 180kB ± 0% 180kB ± 0% ~ (all equal)

name old exe-bytes new exe-bytes delta
HelloSize 1.23MB ± 0% 1.23MB ± 0% -0.33% (p=0.000 n=20+20)

file before after Δ %
addr2line 4320075 4311883 -8192 -0.190%
asm 5191932 5187836 -4096 -0.079%
buildid 2835338 2831242 -4096 -0.144%
compile 20531717 20569099 +37382 +0.182%
cover 5322511 5318415 -4096 -0.077%
dist 3723749 3719653 -4096 -0.110%
doc 4743515 4739419 -4096 -0.086%
fix 3413960 3409864 -4096 -0.120%
link 6690119 6686023 -4096 -0.061%
nm 4269616 4265520 -4096 -0.096%
pprof 14942189 14929901 -12288 -0.082%
trace 11807164 11790780 -16384 -0.139%
vet 8384104 8388200 +4096 +0.049%
go 15339076 15334980 -4096 -0.027%
total 132258257 132226007 -32250 -0.024%

Fixes #30645.

Change-Id: If551ac5996097f3685870d083151b5843170aab0
Reviewed-on: https://go-review.googlesource.com/c/go/+/165998
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2024-11-23 18:50:05 -07:00 · 2019-05-20 11:55:56 -07:00 · 2019-05-20 11:55:56 -07:00 · e37cc29863
commit e37cc29863
parent c9ece81cc8
9 changed files with 3080 additions and 27 deletions
--- a/src/cmd/compile/internal/ssa/branchelim.go
+++ b/src/cmd/compile/internal/ssa/branchelim.go
@ -148,7 +148,7 @@ func elimIf(f *Func, loadAddr *sparseSet, dom *Block) bool {
 	// the number of useless instructions executed.
 	const maxfuseinsts = 2

-	if len(simple.Values) > maxfuseinsts || !allTrivial(simple) {
+	if len(simple.Values) > maxfuseinsts || !canSpeculativelyExecute(simple) {
 		return false
 	}

@ -305,10 +305,10 @@ func elimIfElse(f *Func, loadAddr *sparseSet, b *Block) bool {
 		return false
 	}
 	yes, no := b.Succs[0].Block(), b.Succs[1].Block()
-	if !isLeafPlain(yes) || len(yes.Values) > 1 || !allTrivial(yes) {
+	if !isLeafPlain(yes) || len(yes.Values) > 1 || !canSpeculativelyExecute(yes) {
 		return false
 	}
-	if !isLeafPlain(no) || len(no.Values) > 1 || !allTrivial(no) {
+	if !isLeafPlain(no) || len(no.Values) > 1 || !canSpeculativelyExecute(no) {
 		return false
 	}
 	if b.Succs[0].Block().Succs[0].Block() != b.Succs[1].Block().Succs[0].Block() {
@ -415,7 +415,15 @@ func shouldElimIfElse(no, yes, post *Block, arch string) bool {
 	}
 }

-func allTrivial(b *Block) bool {
+// canSpeculativelyExecute reports whether every value in the block can
+// be evaluated without causing any observable side effects (memory
+// accesses, panics and so on) except for execution time changes. It
+// also ensures that the block does not contain any phis which we can't
+// speculatively execute.
+// Warning: this function cannot currently detect values that represent
+// instructions the execution of which need to be guarded with CPU
+// hardware feature checks. See issue #34950.
+func canSpeculativelyExecute(b *Block) bool {
 	// don't fuse memory ops, Phi ops, divides (can panic),
 	// or anything else with side-effects
 	for _, v := range b.Values {
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@ -428,7 +428,7 @@ var passes = [...]pass{
 	{name: "gcse deadcode", fn: deadcode, required: true}, // clean out after cse and phiopt
 	{name: "nilcheckelim", fn: nilcheckelim},
 	{name: "prove", fn: prove},
-	{name: "fuse plain", fn: fusePlain},
+	{name: "early fuse", fn: fuseEarly},
 	{name: "decompose builtin", fn: decomposeBuiltIn, required: true},
 	{name: "softfloat", fn: softfloat, required: true},
 	{name: "late opt", fn: opt, required: true}, // TODO: split required rules and optimizing rules
@ -436,7 +436,7 @@ var passes = [...]pass{
 	{name: "generic deadcode", fn: deadcode, required: true}, // remove dead stores, which otherwise mess up store chain
 	{name: "check bce", fn: checkbce},
 	{name: "branchelim", fn: branchelim},
-	{name: "fuse", fn: fuseAll},
+	{name: "late fuse", fn: fuseLate},
 	{name: "dse", fn: dse},
 	{name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops
 	{name: "insert resched checks", fn: insertLoopReschedChecks,
@ -491,7 +491,7 @@ var passOrder = [...]constraint{
 	// allow deadcode to clean up after nilcheckelim
 	{"nilcheckelim", "generic deadcode"},
 	// nilcheckelim generates sequences of plain basic blocks
-	{"nilcheckelim", "fuse"},
+	{"nilcheckelim", "late fuse"},
 	// nilcheckelim relies on opt to rewrite user nil checks
 	{"opt", "nilcheckelim"},
 	// tighten will be most effective when as many values have been removed as possible
--- a/src/cmd/compile/internal/ssa/fuse.go
+++ b/src/cmd/compile/internal/ssa/fuse.go
@ -8,18 +8,18 @@ import (
 	"cmd/internal/src"
 )

-// fusePlain runs fuse(f, fuseTypePlain).
-func fusePlain(f *Func) { fuse(f, fuseTypePlain) }
+// fuseEarly is the "early fuse" pass: fuse with fuseBlockPlain and fuseIntegerComparisons enabled.
+func fuseEarly(f *Func) { fuse(f, fuseTypePlain|fuseTypeIntInRange) }

-// fuseAll runs fuse(f, fuseTypeAll).
-func fuseAll(f *Func) { fuse(f, fuseTypeAll) }
+// fuseLate is the "late fuse" pass: fuse with fuseBlockPlain and fuseBlockIf enabled.
+func fuseLate(f *Func) { fuse(f, fuseTypePlain|fuseTypeIf) }

 type fuseType uint8

 const (
 	fuseTypePlain fuseType = 1 << iota
 	fuseTypeIf
-	fuseTypeAll = fuseTypePlain | fuseTypeIf
+	fuseTypeIntInRange
 )

 // fuse simplifies control flow by joining basic blocks.
@ -32,6 +32,9 @@ func fuse(f *Func, typ fuseType) {
 			if typ&fuseTypeIf != 0 {
 				changed = fuseBlockIf(b) || changed
 			}
+			if typ&fuseTypeIntInRange != 0 {
+				changed = fuseIntegerComparisons(b) || changed
+			}
 			if typ&fuseTypePlain != 0 {
 				changed = fuseBlockPlain(b) || changed
 			}
--- a/src/cmd/compile/internal/ssa/fuse_comparisons.go
+++ b/src/cmd/compile/internal/ssa/fuse_comparisons.go
@ -0,0 +1,157 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+// fuseIntegerComparisons optimizes inequalities such as '1 <= x && x < 5',
+// which can be optimized to 'unsigned(x-1) < 4'. It reports whether it merged.
+//
+// Look for branch structure like:
+//
+//   p
+//   |\
+//   | b
+//   |/ \
+//   s0 s1
+//
+// In our example, p has control '1 <= x', b has control 'x < 5',
+// and s0 and s1 are the if and else results of the comparison.
+//
+// This will be optimized into:
+//
+//   p
+//    \
+//     b
+//    / \
+//   s0 s1
+//
+// where b's control is the AndB/OrB of both controls, which the generic rules
+// can then rewrite to 'unsigned(x-1) < 4'. Later passes will then fuse p and b.
+func fuseIntegerComparisons(b *Block) bool {
+	if len(b.Preds) != 1 {
+		return false
+	}
+	p := b.Preds[0].Block()
+	if b.Kind != BlockIf || p.Kind != BlockIf {
+		return false
+	}
+
+	// Don't merge control values if b is likely to be bypassed anyway.
+	if p.Likely == BranchLikely && p.Succs[0].Block() != b {
+		return false
+	}
+	if p.Likely == BranchUnlikely && p.Succs[1].Block() != b {
+		return false
+	}
+
+	// Check if the control values combine to make an integer inequality that
+	// can be further optimized later.
+	bc := b.Controls[0]
+	pc := p.Controls[0]
+	if !areMergeableInequalities(bc, pc) {
+		return false
+	}
+
+	// If the first (true) successors match then we have a disjunction (||).
+	// If the second (false) successors match then we have a conjunction (&&).
+	for i, op := range [2]Op{OpOrB, OpAndB} {
+		if p.Succs[i].Block() != b.Succs[i].Block() {
+			continue
+		}
+
+		// TODO(mundaym): should we also check the cost of executing b?
+		// Currently we might speculatively execute b even if b contains
+		// a lot of instructions. We could just check that len(b.Values)
+		// is lower than a fixed amount. Bear in mind however that the
+		// other optimization passes might yet reduce the cost of b
+		// significantly so we shouldn't be overly conservative.
+		if !canSpeculativelyExecute(b) {
+			return false
+		}
+
+		// Logically combine the control values for p and b.
+		v := b.NewValue0(bc.Pos, op, bc.Type)
+		v.AddArg(pc)
+		v.AddArg(bc)
+
+		// Set the combined control value as the control value for b.
+		b.SetControl(v)
+
+		// Drop p's edge to the shared successor so that p jumps directly to b.
+		p.removeEdge(i)
+		p.Kind = BlockPlain
+		p.Likely = BranchUnknown
+		p.ResetControls()
+
+		return true
+	}
+
+	// TODO: could negate condition(s) to merge controls.
+	return false
+}
+
+// getConstIntArgIndex returns the index of the first argument of v that is an
+// integer constant (OpConst8/16/32/64), or -1 if no such argument exists.
+func getConstIntArgIndex(v *Value) int {
+	for i, a := range v.Args {
+		switch a.Op {
+		case OpConst8, OpConst16, OpConst32, OpConst64:
+			return i
+		}
+	}
+	return -1
+}
+
+// isSignedInequality reports whether v.Op is a signed < (Less) or ≤ (Leq)
+// comparison of any width (8, 16, 32 or 64 bits).
+func isSignedInequality(v *Value) bool {
+	switch v.Op {
+	case OpLess64, OpLess32, OpLess16, OpLess8,
+		OpLeq64, OpLeq32, OpLeq16, OpLeq8:
+		return true
+	}
+	return false
+}
+
+// isUnsignedInequality reports whether v.Op is an unsigned < (LessU) or ≤ (LeqU)
+// comparison of any width (8, 16, 32 or 64 bits).
+func isUnsignedInequality(v *Value) bool {
+	switch v.Op {
+	case OpLess64U, OpLess32U, OpLess16U, OpLess8U,
+		OpLeq64U, OpLeq32U, OpLeq16U, OpLeq8U:
+		return true
+	}
+	return false
+}
+
+func areMergeableInequalities(x, y *Value) bool {
+	// x and y are mergeable if both are signed or both are unsigned inequalities
+	// that compare the same value against integer constants.
+	// TODO(mundaym): also merge when an Eq op could be transformed into a Less/Leq.
+	// For example in the unsigned domain 'x == 0 || 3 < x' is 'x <= 0 || 3 < x'.
+	inequalityChecks := [...]func(*Value) bool{
+		isSignedInequality,
+		isUnsignedInequality,
+	}
+	for _, f := range inequalityChecks {
+		if !f(x) || !f(y) {
+			continue
+		}
+
+		// Check that both inequalities are comparisons with constants.
+		xi := getConstIntArgIndex(x)
+		if xi < 0 {
+			return false
+		}
+		yi := getConstIntArgIndex(y)
+		if yi < 0 {
+			return false
+		}
+
+		// Check that the other arguments to the inequalities are the same
+		// value (xi^1 and yi^1 flip index 0<->1 to pick the other operand).
+		return x.Args[xi^1] == y.Args[yi^1]
+	}
+	return false
+}
--- a/src/cmd/compile/internal/ssa/fuse_test.go
+++ b/src/cmd/compile/internal/ssa/fuse_test.go
@ -26,7 +26,7 @@ func TestFuseEliminatesOneBranch(t *testing.T) {
 			Exit("mem")))

 	CheckFunc(fun.f)
-	fuseAll(fun.f)
+	fuseLate(fun.f)

 	for _, b := range fun.f.Blocks {
 		if b == fun.blocks["then"] && b.Kind != BlockInvalid {
@ -56,7 +56,7 @@ func TestFuseEliminatesBothBranches(t *testing.T) {
 			Exit("mem")))

 	CheckFunc(fun.f)
-	fuseAll(fun.f)
+	fuseLate(fun.f)

 	for _, b := range fun.f.Blocks {
 		if b == fun.blocks["then"] && b.Kind != BlockInvalid {
@ -90,7 +90,7 @@ func TestFuseHandlesPhis(t *testing.T) {
 			Exit("mem")))

 	CheckFunc(fun.f)
-	fuseAll(fun.f)
+	fuseLate(fun.f)

 	for _, b := range fun.f.Blocks {
 		if b == fun.blocks["then"] && b.Kind != BlockInvalid {
@ -122,7 +122,7 @@ func TestFuseEliminatesEmptyBlocks(t *testing.T) {
 		))

 	CheckFunc(fun.f)
-	fuseAll(fun.f)
+	fuseLate(fun.f)

 	for k, b := range fun.blocks {
 		if k[:1] == "z" && b.Kind != BlockInvalid {
@ -153,7 +153,7 @@ func TestFuseSideEffects(t *testing.T) {
 			Goto("loop")))

 	CheckFunc(fun.f)
-	fuseAll(fun.f)
+	fuseLate(fun.f)

 	for _, b := range fun.f.Blocks {
 		if b == fun.blocks["then"] && b.Kind == BlockInvalid {
@ -196,7 +196,7 @@ func BenchmarkFuse(b *testing.B) {
 			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
 				fun := c.Fun("entry", blocks...)
-				fuseAll(fun.f)
+				fuseLate(fun.f)
 			}
 		})
 	}
--- a/src/cmd/compile/internal/ssa/gen/generic.rules
+++ b/src/cmd/compile/internal/ssa/gen/generic.rules
@ -254,6 +254,54 @@
 (Neq16 (Const16 <t> [c]) (Add16 (Const16 <t> [d]) x)) -> (Neq16 (Const16 <t> [int64(int16(c-d))]) x)
 (Neq8  (Const8  <t> [c]) (Add8  (Const8  <t> [d]) x)) -> (Neq8 (Const8 <t> [int64(int8(c-d))]) x)

+// signed integer range: ( c <= x && x (<|<=) d ) -> ( unsigned(x-c) (<|<=) unsigned(d-c) )
+(AndB (Leq64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c -> ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c]))
+(AndB (Leq32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c -> ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [d-c]))
+(AndB (Leq16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c -> ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [d-c]))
+(AndB (Leq8  (Const8  [c]) x) ((Less|Leq)8  x (Const8  [d]))) && d >= c -> ((Less|Leq)8U  (Sub8  <x.Type> x (Const8  <x.Type> [c])) (Const8  <x.Type> [d-c]))
+
+// signed integer range: ( c < x && x (<|<=) d ) -> ( unsigned(x-(c+1)) (<|<=) unsigned(d-(c+1)) )
+// The (c+1) > c condition rejects the maximum signed c, for which c+1 would overflow.
+(AndB (Less64 (Const64 [c]) x) ((Less|Leq)64 x (Const64 [d]))) && d >= c+1 && int64(c+1) > int64(c) -> ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1]))
+(AndB (Less32 (Const32 [c]) x) ((Less|Leq)32 x (Const32 [d]))) && d >= c+1 && int32(c+1) > int32(c) -> ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c+1])) (Const32 <x.Type> [d-c-1]))
+(AndB (Less16 (Const16 [c]) x) ((Less|Leq)16 x (Const16 [d]))) && d >= c+1 && int16(c+1) > int16(c) -> ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c+1])) (Const16 <x.Type> [d-c-1]))
+(AndB (Less8  (Const8  [c]) x) ((Less|Leq)8  x (Const8  [d]))) && d >= c+1 && int8(c+1)  > int8(c)  -> ((Less|Leq)8U  (Sub8  <x.Type> x (Const8  <x.Type> [c+1])) (Const8  <x.Type> [d-c-1]))
+
+// unsigned integer range: ( c <= x && x (<|<=) d ) -> ( x-c (<|<=) d-c )
+(AndB (Leq64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c) -> ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c])) (Const64 <x.Type> [d-c]))
+(AndB (Leq32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c) -> ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [c])) (Const32 <x.Type> [int64(int32(d-c))]))
+(AndB (Leq16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c) -> ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [c])) (Const16 <x.Type> [int64(int16(d-c))]))
+(AndB (Leq8U  (Const8  [c]) x) ((Less|Leq)8U  x (Const8  [d]))) && uint8(d)  >= uint8(c)  -> ((Less|Leq)8U  (Sub8  <x.Type> x (Const8  <x.Type> [c])) (Const8  <x.Type> [int64(int8(d-c))]))
+
+// unsigned integer range: ( c < x && x (<|<=) d ) -> ( x-(c+1) (<|<=) d-(c+1) )
+// The (c+1) > c condition rejects the maximum unsigned c, for which c+1 would wrap to 0.
+(AndB (Less64U (Const64 [c]) x) ((Less|Leq)64U x (Const64 [d]))) && uint64(d) >= uint64(c+1) && uint64(c+1) > uint64(c) -> ((Less|Leq)64U (Sub64 <x.Type> x (Const64 <x.Type> [c+1])) (Const64 <x.Type> [d-c-1]))
+(AndB (Less32U (Const32 [c]) x) ((Less|Leq)32U x (Const32 [d]))) && uint32(d) >= uint32(c+1) && uint32(c+1) > uint32(c) -> ((Less|Leq)32U (Sub32 <x.Type> x (Const32 <x.Type> [int64(int32(c+1))])) (Const32 <x.Type> [int64(int32(d-c-1))]))
+(AndB (Less16U (Const16 [c]) x) ((Less|Leq)16U x (Const16 [d]))) && uint16(d) >= uint16(c+1) && uint16(c+1) > uint16(c) -> ((Less|Leq)16U (Sub16 <x.Type> x (Const16 <x.Type> [int64(int16(c+1))])) (Const16 <x.Type> [int64(int16(d-c-1))]))
+(AndB (Less8U  (Const8  [c]) x) ((Less|Leq)8U  x (Const8  [d]))) && uint8(d)  >= uint8(c+1)  && uint8(c+1)  > uint8(c)  -> ((Less|Leq)8U  (Sub8  <x.Type> x (Const8  <x.Type> [int64(int8(c+1))]))  (Const8  <x.Type> [int64(int8(d-c-1))]))
+
+// signed integer range: ( c (<|<=) x || x < d ) -> ( unsigned(c-d) (<|<=) unsigned(x-d) )
+(OrB ((Less|Leq)64 (Const64 [c]) x) (Less64 x (Const64 [d]))) && c >= d -> ((Less|Leq)64U (Const64 <x.Type> [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d])))
+(OrB ((Less|Leq)32 (Const32 [c]) x) (Less32 x (Const32 [d]))) && c >= d -> ((Less|Leq)32U (Const32 <x.Type> [c-d]) (Sub32 <x.Type> x (Const32 <x.Type> [d])))
+(OrB ((Less|Leq)16 (Const16 [c]) x) (Less16 x (Const16 [d]))) && c >= d -> ((Less|Leq)16U (Const16 <x.Type> [c-d]) (Sub16 <x.Type> x (Const16 <x.Type> [d])))
+(OrB ((Less|Leq)8  (Const8  [c]) x) (Less8  x (Const8  [d]))) && c >= d -> ((Less|Leq)8U  (Const8  <x.Type> [c-d]) (Sub8  <x.Type> x (Const8  <x.Type> [d])))
+
+// signed integer range: ( c (<|<=) x || x <= d ) -> ( unsigned(c-(d+1)) (<|<=) unsigned(x-(d+1)) )
+// The (d+1) > d condition rejects the maximum signed d, for which d+1 would overflow.
+(OrB ((Less|Leq)64 (Const64 [c]) x) (Leq64 x (Const64 [d]))) && c >= d+1 && int64(d+1) > int64(d) -> ((Less|Leq)64U (Const64 <x.Type> [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1])))
+(OrB ((Less|Leq)32 (Const32 [c]) x) (Leq32 x (Const32 [d]))) && c >= d+1 && int32(d+1) > int32(d) -> ((Less|Leq)32U (Const32 <x.Type> [c-d-1]) (Sub32 <x.Type> x (Const32 <x.Type> [d+1])))
+(OrB ((Less|Leq)16 (Const16 [c]) x) (Leq16 x (Const16 [d]))) && c >= d+1 && int16(d+1) > int16(d) -> ((Less|Leq)16U (Const16 <x.Type> [c-d-1]) (Sub16 <x.Type> x (Const16 <x.Type> [d+1])))
+(OrB ((Less|Leq)8  (Const8  [c]) x) (Leq8  x (Const8  [d]))) && c >= d+1 && int8(d+1)  > int8(d)  -> ((Less|Leq)8U  (Const8  <x.Type> [c-d-1]) (Sub8  <x.Type> x (Const8  <x.Type> [d+1])))
+
+// unsigned integer range: ( c (<|<=) x || x < d ) -> ( c-d (<|<=) x-d )
+(OrB ((Less|Leq)64U (Const64 [c]) x) (Less64U x (Const64 [d]))) && uint64(c) >= uint64(d) -> ((Less|Leq)64U (Const64 <x.Type>               [c-d]) (Sub64 <x.Type> x (Const64 <x.Type> [d])))
+(OrB ((Less|Leq)32U (Const32 [c]) x) (Less32U x (Const32 [d]))) && uint32(c) >= uint32(d) -> ((Less|Leq)32U (Const32 <x.Type> [int64(int32(c-d))]) (Sub32 <x.Type> x (Const32 <x.Type> [d])))
+(OrB ((Less|Leq)16U (Const16 [c]) x) (Less16U x (Const16 [d]))) && uint16(c) >= uint16(d) -> ((Less|Leq)16U (Const16 <x.Type> [int64(int16(c-d))]) (Sub16 <x.Type> x (Const16 <x.Type> [d])))
+(OrB ((Less|Leq)8U  (Const8  [c]) x) (Less8U  x (Const8  [d]))) && uint8(c)  >= uint8(d)  -> ((Less|Leq)8U  (Const8  <x.Type> [int64( int8(c-d))]) (Sub8  <x.Type> x (Const8  <x.Type> [d])))
+
+// unsigned integer range: ( c (<|<=) x || x <= d ) -> ( c-(d+1) (<|<=) x-(d+1) )
+// The (d+1) > d condition rejects the maximum unsigned d, for which d+1 would wrap to 0.
+(OrB ((Less|Leq)64U (Const64 [c]) x) (Leq64U x (Const64 [d]))) && uint64(c) >= uint64(d+1) && uint64(d+1) > uint64(d) -> ((Less|Leq)64U (Const64 <x.Type>               [c-d-1]) (Sub64 <x.Type> x (Const64 <x.Type> [d+1])))
+(OrB ((Less|Leq)32U (Const32 [c]) x) (Leq32U x (Const32 [d]))) && uint32(c) >= uint32(d+1) && uint32(d+1) > uint32(d) -> ((Less|Leq)32U (Const32 <x.Type> [int64(int32(c-d-1))]) (Sub32 <x.Type> x (Const32 <x.Type> [int64(int32(d+1))])))
+(OrB ((Less|Leq)16U (Const16 [c]) x) (Leq16U x (Const16 [d]))) && uint16(c) >= uint16(d+1) && uint16(d+1) > uint16(d) -> ((Less|Leq)16U (Const16 <x.Type> [int64(int16(c-d-1))]) (Sub16 <x.Type> x (Const16 <x.Type> [int64(int16(d+1))])))
+(OrB ((Less|Leq)8U  (Const8  [c]) x) (Leq8U  x (Const8  [d]))) && uint8(c)  >= uint8(d+1)  && uint8(d+1)  > uint8(d)  -> ((Less|Leq)8U  (Const8  <x.Type> [int64( int8(c-d-1))]) (Sub8  <x.Type> x (Const8  <x.Type> [int64( int8(d+1))])))
+
 // Canonicalize x-const to x+(-const)
 (Sub64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Add64 (Const64 <t> [-c]) x)
 (Sub32 x (Const32 <t> [c])) && x.Op != OpConst32 -> (Add32 (Const32 <t> [int64(int32(-c))]) x)
--- a/src/cmd/compile/internal/ssa/nilcheck_test.go
+++ b/src/cmd/compile/internal/ssa/nilcheck_test.go
@ -87,7 +87,7 @@ func TestNilcheckSimple(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -124,7 +124,7 @@ func TestNilcheckDomOrder(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -157,7 +157,7 @@ func TestNilcheckAddr(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -191,7 +191,7 @@ func TestNilcheckAddPtr(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -235,7 +235,7 @@ func TestNilcheckPhi(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -276,7 +276,7 @@ func TestNilcheckKeepRemove(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -323,7 +323,7 @@ func TestNilcheckInFalseBranch(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -374,7 +374,7 @@ func TestNilcheckUser(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
@ -418,7 +418,7 @@ func TestNilcheckBug(t *testing.T) {
 	nilcheckelim(fun.f)

 	// clean up the removed nil check
-	fusePlain(fun.f)
+	fuse(fun.f, fuseTypePlain)
 	deadcode(fun.f)

 	CheckFunc(fun.f)
--- a/src/cmd/compile/internal/ssa/rewritegeneric.go
+++ b/src/cmd/compile/internal/ssa/rewritegeneric.go
--- a/test/codegen/fuse.go
+++ b/test/codegen/fuse.go
@ -0,0 +1,197 @@
+// asmcheck
+
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package codegen
+
+// Notes:
+// - these examples use channels to provide a source of
+//   unknown values that cannot be optimized away
+// - these examples use for loops to force branches
+//   backward (predicted taken)
+
+// ---------------------------------- //
+// signed integer range (conjunction) //
+// ---------------------------------- //
+
+func si1c(c <-chan int64) { // expects the range check to fuse into unsigned(x) < 256
+	// amd64:"CMPQ\t.+, [$]256"
+	// s390x:"CLGIJ\t[$]12, R[0-9]+, [$]255"
+	for x := <-c; x >= 0 && x < 256; x = <-c {
+	}
+}
+
+func si2c(c <-chan int32) { // expects the range check to fuse into unsigned(x) < 256
+	// amd64:"CMPL\t.+, [$]256"
+	// s390x:"CLIJ\t[$]12, R[0-9]+, [$]255"
+	for x := <-c; x >= 0 && x < 256; x = <-c {
+	}
+}
+
+func si3c(c <-chan int16) { // expects the range check to fuse into unsigned(x) < 256
+	// amd64:"CMPW\t.+, [$]256"
+	// s390x:"CLIJ\t[$]12, R[0-9]+, [$]255"
+	for x := <-c; x >= 0 && x < 256; x = <-c {
+	}
+}
+
+func si4c(c <-chan int8) { // expects the range check to fuse into unsigned(x) < 10
+	// amd64:"CMPB\t.+, [$]10"
+	// s390x:"CLIJ\t[$]4, R[0-9]+, [$]10"
+	for x := <-c; x >= 0 && x < 10; x = <-c {
+	}
+}
+
+func si5c(c <-chan int64) { // expects the range check to fuse into unsigned(x-5) < 251
+	// amd64:"CMPQ\t.+, [$]251","ADDQ\t[$]-5,"
+	// s390x:"CLGIJ\t[$]4, R[0-9]+, [$]251","ADD\t[$]-5,"
+	for x := <-c; x < 256 && x > 4; x = <-c {
+	}
+}
+
+func si6c(c <-chan int32) { // expects the range check to fuse into unsigned(x-1) <= 255
+	// amd64:"CMPL\t.+, [$]255","DECL\t"
+	// s390x:"CLIJ\t[$]12, R[0-9]+, [$]255","ADDW\t[$]-1,"
+	for x := <-c; x > 0 && x <= 256; x = <-c {
+	}
+}
+
+func si7c(c <-chan int16) { // expects the range check to fuse into unsigned(x+10) <= 60
+	// amd64:"CMPW\t.+, [$]60","ADDL\t[$]10,"
+	// s390x:"CLIJ\t[$]12, R[0-9]+, [$]60","ADDW\t[$]10,"
+	for x := <-c; x >= -10 && x <= 50; x = <-c {
+	}
+}
+
+func si8c(c <-chan int8) { // expects the range check to fuse into unsigned(x+126) < 126
+	// amd64:"CMPB\t.+, [$]126","ADDL\t[$]126,"
+	// s390x:"CLIJ\t[$]4, R[0-9]+, [$]126","ADDW\t[$]126,"
+	for x := <-c; x >= -126 && x < 0; x = <-c {
+	}
+}
+
+// ---------------------------------- //
+// signed integer range (disjunction) //
+// ---------------------------------- //
+
+func si1d(c <-chan int64) { // expects the range check to fuse into 256 <= unsigned(x)
+	// amd64:"CMPQ\t.+, [$]256"
+	// s390x:"CLGIJ\t[$]2, R[0-9]+, [$]255"
+	for x := <-c; x < 0 || x >= 256; x = <-c {
+	}
+}
+
+func si2d(c <-chan int32) { // expects the range check to fuse into 256 <= unsigned(x)
+	// amd64:"CMPL\t.+, [$]256"
+	// s390x:"CLIJ\t[$]2, R[0-9]+, [$]255"
+	for x := <-c; x < 0 || x >= 256; x = <-c {
+	}
+}
+
+func si3d(c <-chan int16) { // expects the range check to fuse into 256 <= unsigned(x)
+	// amd64:"CMPW\t.+, [$]256"
+	// s390x:"CLIJ\t[$]2, R[0-9]+, [$]255"
+	for x := <-c; x < 0 || x >= 256; x = <-c {
+	}
+}
+
+func si4d(c <-chan int8) { // expects the range check to fuse into 10 <= unsigned(x)
+	// amd64:"CMPB\t.+, [$]10"
+	// s390x:"CLIJ\t[$]10, R[0-9]+, [$]10"
+	for x := <-c; x < 0 || x >= 10; x = <-c {
+	}
+}
+
+func si5d(c <-chan int64) { // expects the range check to fuse into 251 <= unsigned(x-5)
+	// amd64:"CMPQ\t.+, [$]251","ADDQ\t[$]-5,"
+	// s390x:"CLGIJ\t[$]10, R[0-9]+, [$]251","ADD\t[$]-5,"
+	for x := <-c; x >= 256 || x <= 4; x = <-c {
+	}
+}
+
+func si6d(c <-chan int32) { // expects the range check to fuse into 255 < unsigned(x-1)
+	// amd64:"CMPL\t.+, [$]255","DECL\t"
+	// s390x:"CLIJ\t[$]2, R[0-9]+, [$]255","ADDW\t[$]-1,"
+	for x := <-c; x <= 0 || x > 256; x = <-c {
+	}
+}
+
+func si7d(c <-chan int16) { // expects the range check to fuse into 60 < unsigned(x+10)
+	// amd64:"CMPW\t.+, [$]60","ADDL\t[$]10,"
+	// s390x:"CLIJ\t[$]2, R[0-9]+, [$]60","ADDW\t[$]10,"
+	for x := <-c; x < -10 || x > 50; x = <-c {
+	}
+}
+
+func si8d(c <-chan int8) { // expects the range check to fuse into 126 <= unsigned(x+126)
+	// amd64:"CMPB\t.+, [$]126","ADDL\t[$]126,"
+	// s390x:"CLIJ\t[$]10, R[0-9]+, [$]126","ADDW\t[$]126,"
+	for x := <-c; x < -126 || x >= 0; x = <-c {
+	}
+}
+
+// ------------------------------------ //
+// unsigned integer range (conjunction) //
+// ------------------------------------ //
+
+func ui1c(c <-chan uint64) { // expects the range check to fuse into x-5 < 251 (unsigned)
+	// amd64:"CMPQ\t.+, [$]251","ADDQ\t[$]-5,"
+	// s390x:"CLGIJ\t[$]4, R[0-9]+, [$]251","ADD\t[$]-5,"
+	for x := <-c; x < 256 && x > 4; x = <-c {
+	}
+}
+
+func ui2c(c <-chan uint32) { // expects the range check to fuse into x-1 <= 255 (unsigned)
+	// amd64:"CMPL\t.+, [$]255","DECL\t"
+	// s390x:"CLIJ\t[$]12, R[0-9]+, [$]255","ADDW\t[$]-1,"
+	for x := <-c; x > 0 && x <= 256; x = <-c {
+	}
+}
+
+func ui3c(c <-chan uint16) { // expects the range check to fuse into x-10 <= 40 (unsigned)
+	// amd64:"CMPW\t.+, [$]40","ADDL\t[$]-10,"
+	// s390x:"CLIJ\t[$]12, R[0-9]+, [$]40","ADDW\t[$]-10,"
+	for x := <-c; x >= 10 && x <= 50; x = <-c {
+	}
+}
+
+func ui4c(c <-chan uint8) { // expects the range check to fuse into x-126 < 2 (unsigned)
+	// amd64:"CMPB\t.+, [$]2","ADDL\t[$]-126,"
+	// s390x:"CLIJ\t[$]4, R[0-9]+, [$]2","ADDW\t[$]-126,"
+	for x := <-c; x >= 126 && x < 128; x = <-c {
+	}
+}
+
+// ------------------------------------ //
+// unsigned integer range (disjunction) //
+// ------------------------------------ //
+
+func ui1d(c <-chan uint64) { // expects the range check to fuse into 251 <= x-5 (unsigned)
+	// amd64:"CMPQ\t.+, [$]251","ADDQ\t[$]-5,"
+	// s390x:"CLGIJ\t[$]10, R[0-9]+, [$]251","ADD\t[$]-5,"
+	for x := <-c; x >= 256 || x <= 4; x = <-c {
+	}
+}
+
+func ui2d(c <-chan uint32) { // expects the range check to fuse into 254 < x-2 (unsigned)
+	// amd64:"CMPL\t.+, [$]254","ADDL\t[$]-2,"
+	// s390x:"CLIJ\t[$]2, R[0-9]+, [$]254","ADDW\t[$]-2,"
+	for x := <-c; x <= 1 || x > 256; x = <-c {
+	}
+}
+
+func ui3d(c <-chan uint16) { // expects the range check to fuse into 40 < x-10 (unsigned)
+	// amd64:"CMPW\t.+, [$]40","ADDL\t[$]-10,"
+	// s390x:"CLIJ\t[$]2, R[0-9]+, [$]40","ADDW\t[$]-10,"
+	for x := <-c; x < 10 || x > 50; x = <-c {
+	}
+}
+
+func ui4d(c <-chan uint8) { // expects the range check to fuse into 2 <= x-126 (unsigned)
+	// amd64:"CMPB\t.+, [$]2","ADDL\t[$]-126,"
+	// s390x:"CLIJ\t[$]10, R[0-9]+, [$]2","ADDW\t[$]-126,"
+	for x := <-c; x < 126 || x >= 128; x = <-c {
+	}
+}