
cmd/compile: rework unbounded shift lowering on PPC64

This reduces unbounded shift latency by one cycle, and may
generate fewer instructions in some cases.

When there is a choice between doubleword and word shifts, use
doubleword shifts: they have fewer hardware scheduling restrictions
across P8/P9/P10.

Likewise, rework the shift sequence so the compare, shift, and
overshift values are computed in parallel, and the correct value is
then selected.
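
As a rough model of the reworked sequence, take a 64-bit arithmetic
right shift with an unbounded 64-bit count (rule Rsh64x64 below). The
Go sketch is illustrative only; rsh64x64 and srad are invented names
here, not compiler code:

// The machine shift, the saturated "overshift" value, and the compare
// have no mutual dependencies, so the hardware can compute all three
// in parallel; ISEL then picks the correct result in one more cycle.
func rsh64x64(x int64, y uint64) int64 {
	shifted := srad(x, y) // SRAD x, y: the wanted result when y < 64
	over := x >> 63       // SRADconst x, [63]: all sign bits, the y >= 64 result
	if y < 64 {           // CMPUconst y, [64] feeding ISEL [0] (select on LT)
		return shifted
	}
	return over
}

// srad models PPC64 SRAD: it uses the low 7 bits of the count, and
// counts of 64..127 replicate the sign bit across the result.
func srad(x int64, y uint64) int64 {
	n := y & 127
	if n > 63 {
		n = 63
	}
	return x >> n
}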

Some ANDCCconst rules also needed reworking to ensure they simplify
when used for their flag value. This commonly occurs when prove fails
to identify a bounded shift (e.g. foo32<<uint(x&31)), as sketched
below.
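
For reference, the pattern in question looks like this (the variable
names are illustrative):

// The count x&31 is bounded by construction, so the overshift value
// can never be selected. When prove misses this, the flag-producing
// ANDCCconst rules let the lowered compare fold away anyway.
func maskedShift(foo32 uint32, x int) uint32 {
	return foo32 << uint(x&31)
}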

Change-Id: Ifc6ff4a865d68675e57745056db414b0eb6f2d34
Reviewed-on: https://go-review.googlesource.com/c/go/+/442597
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Than McIntosh <thanm@google.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Paul E. Murphy 2022-09-27 13:13:10 -05:00 committed by Paul Murphy
parent f60a2a9c94
commit f98dd29910
3 changed files with 943 additions and 851 deletions

@@ -172,61 +172,60 @@
// Lower bounded shifts first. No need to check shift value.
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD (MOVHZreg x) y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD (MOVBZreg x) y)
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD (MOVHreg x) y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD (MOVBreg x) y)
// non-constant rotates
// If shift > 64 then use -1 as shift count to shift all bits.
((Lsh64|Rsh64|Rsh64U)x64 x y) => (S(L|RA|R)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x64 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
// Unbounded shifts. Go shifts saturate to 0 or -1 when shifting beyond the number of
// bits in a type, PPC64 shifts do not (see the ISA for details).
//
// Note, y is always non-negative.
//
// Note, ISELZ is intentionally not used in lower. Where possible, ISEL is converted to ISELZ in late lower
// after all the ISEL folding rules have been exercised.
(Rsh(16|16U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
(Lsh16x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
((Rsh64U|Lsh64)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
((Rsh64U|Lsh64)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
((Rsh64U|Lsh64)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFC0] y)))
((Rsh64U|Lsh64)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00C0] y)))
(Rsh64x(64|32) <t> x y) => (ISEL [0] (SRAD <t> x y) (SRADconst <t> x [63]) (CMP(U|WU)const y [64]))
(Rsh64x16 <t> x y) => (ISEL [2] (SRAD <t> x y) (SRADconst <t> x [63]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFC0] y)))
(Rsh64x8 <t> x y) => (ISEL [2] (SRAD <t> x y) (SRADconst <t> x [63]) (Select1 <types.TypeFlags> (ANDCCconst [0x00C0] y)))
(Rsh(8|8U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
(Lsh8x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
((Rsh32U|Lsh32)x64 <t> x y) => (ISEL [0] (S(R|L)W <t> x y) (MOVDconst [0]) (CMPUconst y [32]))
((Rsh32U|Lsh32)x32 <t> x y) => (ISEL [0] (S(R|L)W <t> x y) (MOVDconst [0]) (CMPWUconst y [32]))
((Rsh32U|Lsh32)x16 <t> x y) => (ISEL [2] (S(R|L)W <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFE0] y)))
((Rsh32U|Lsh32)x8 <t> x y) => (ISEL [2] (S(R|L)W <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00E0] y)))
(Rsh32x(64|32) <t> x y) => (ISEL [0] (SRAW <t> x y) (SRAWconst <t> x [31]) (CMP(U|WU)const y [32]))
(Rsh32x16 <t> x y) => (ISEL [2] (SRAW <t> x y) (SRAWconst <t> x [31]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFE0] y)))
(Rsh32x8 <t> x y) => (ISEL [2] (SRAW <t> x y) (SRAWconst <t> x [31]) (Select1 <types.TypeFlags> (ANDCCconst [0x00E0] y)))
((Rsh64|Rsh64U|Lsh64)x32 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x32 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
((Rsh16U|Lsh16)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (CMPUconst y [16]))
((Rsh16U|Lsh16)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (CMPWUconst y [16]))
((Rsh16U|Lsh16)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF0] y)))
((Rsh16U|Lsh16)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F0] y)))
(Rsh16x(64|32) <t> x y) => (ISEL [0] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (CMP(U|WU)const y [16]))
(Rsh16x16 <t> x y) => (ISEL [2] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF0] y)))
(Rsh16x8 <t> x y) => (ISEL [2] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F0] y)))
(Rsh(16|16U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
(Lsh16x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
((Rsh8U|Lsh8)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (CMPUconst y [8]))
((Rsh8U|Lsh8)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (CMPWUconst y [8]))
((Rsh8U|Lsh8)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF8] y)))
((Rsh8U|Lsh8)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F8] y)))
(Rsh8x(64|32) <t> x y) => (ISEL [0] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (CMP(U|WU)const y [8]))
(Rsh8x16 <t> x y) => (ISEL [2] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF8] y)))
(Rsh8x8 <t> x y) => (ISEL [2] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F8] y)))
(Rsh(8|8U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
(Lsh8x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
// Catch bounded shifts in situations like foo<<uint(shift&63) which might not be caught by the prove pass.
(CMP(U|WU)const [d] (Select0 (ANDCCconst z [c]))) && uint64(d) > uint64(c) => (FlagLT)
((Rsh64|Rsh64U|Lsh64)x16 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x16 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
(Rsh(16|16U)x16 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
(Lsh16x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
(Rsh(8|8U)x16 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
(Lsh8x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
((Rsh64|Rsh64U|Lsh64)x8 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x8 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
(Rsh(16|16U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
(Lsh16x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
(Rsh(8|8U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
(Lsh8x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
// Cleaning up shift ops
(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPU (Select0 (ANDCCconst [d] y)) (MOVDconst [c]))) && c >= d => (Select0 (ANDCCconst [d] y))
(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPUconst [c] (Select0 (ANDCCconst [d] y)))) && c >= d => (Select0 (ANDCCconst [d] y))
(ORN x (MOVDconst [-1])) => x
(S(RAD|RD|LD) x (MOVDconst [c])) => (S(RAD|RD|LD)const [c&63 | (c>>6&1*63)] x)
@@ -563,11 +562,12 @@
(OR x (MOVDconst [c])) && isU32Bit(c) => (ORconst [c] x)
// Simplify consts
(Select0 (ANDCCconst [c] (Select0 (ANDCCconst [d] x)))) => (Select0 (ANDCCconst [c&d] x))
(ANDCCconst [c] (Select0 (ANDCCconst [d] x))) => (ANDCCconst [c&d] x)
(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
(Select0 (ANDCCconst [-1] x)) => x
(Select0 (ANDCCconst [0] _)) => (MOVDconst [0])
(Select1 (ANDCCconst [0] _)) => (FlagEQ)
(XORconst [0] x) => x
(ORconst [-1] _) => (MOVDconst [-1])
(ORconst [0] x) => x
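
For the narrow count types, the rules above avoid a separate
comparison altogether: the x16/x8 cases AND the count against the bit
positions that would put it out of range and use the flag result
directly. A Go sketch of the Lsh64x8 case (lsh64x8 is an invented
name; this is a model, not compiler code):

// ANDCCconst [0x00C0] y sets EQ exactly when y&0xC0 == 0, i.e. when
// the 8-bit count is < 64; ISEL [2] (select on EQ) then picks the
// machine shift, and otherwise the saturated zero value.
func lsh64x8(x uint64, y uint8) uint64 {
	shifted := x << (uint64(y) & 127) // SLD uses the low 7 bits of the count
	if y&0xC0 == 0 {                  // flag result of the AND: EQ means y < 64
		return shifted // ISEL [2] selects the shift
	}
	return 0 // overshift saturates to 0
}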

File diff suppressed because it is too large.

@@ -1061,3 +1061,92 @@ func TestIncorrectRotate(t *testing.T) {
t.Errorf("got %x want 0", got)
}
}
//go:noinline
func variableShiftOverflow64x8(x int64, y, z uint8) (a, b, c int64) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int64(uint64(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow32x8(x int32, y, z uint8) (a, b, c int32) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int32(uint32(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow16x8(x int16, y, z uint8) (a, b, c int16) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int16(uint16(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow8x8(x int8, y, z uint8) (a, b, c int8) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int8(uint8(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow64x16(x int64, y, z uint16) (a, b, c int64) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int64(uint64(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow32x16(x int32, y, z uint16) (a, b, c int32) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int32(uint32(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow16x16(x int16, y, z uint16) (a, b, c int16) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int16(uint16(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow8x16(x int8, y, z uint16) (a, b, c int8) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int8(uint8(x) >> (y + z))
}
//go:noinline
func makeU8(x uint64) uint8 {
// Ensure the upper portions of the register are clear before testing large shift values
// using non-native types (e.g uint8 on PPC64).
return uint8(x)
}
//go:noinline
func makeU16(x uint64) uint16 {
// Ensure the upper portions of the register are clear before testing large shift values
// using non-native types (e.g uint8 on PPC64).
return uint16(x)
}
func TestShiftOverflow(t *testing.T) {
if v, w, z := variableShiftOverflow64x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x7fffffffffffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffffffffffe0", v, w, z)
}
if v, w, z := variableShiftOverflow32x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x7fffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffe0", v, w, z)
}
if v, w, z := variableShiftOverflow16x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x7fe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fe0", v, w, z)
}
if v, w, z := variableShiftOverflow8x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x60 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x60", v, w, z)
}
if v, w, z := variableShiftOverflow64x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x7fffffffffffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffffffffffe0", v, w, z)
}
if v, w, z := variableShiftOverflow32x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x7fffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffe0,", v, w, z)
}
if v, w, z := variableShiftOverflow16x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x7fe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fe0", v, w, z)
}
if v, w, z := variableShiftOverflow8x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x60 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x60", v, w, z)
}
}