
cmd/compile: rework unbounded shift lowering on PPC64

This reduces unbounded shift latency by one cycle, and may
generate fewer instructions in some cases.

When there is a choice between doubleword and word shifts, use
doubleword shifts: they have fewer hardware scheduling restrictions
across P8/P9/P10.

Likewise, rework the shift sequence so the compare, shift, and
overshift values are computed in parallel, and the correct value is
then selected.
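
As a rough model of the reworked sequence, take a 64-bit arithmetic
right shift with an unbounded 64-bit count (rule Rsh64x64 below). The
Go sketch is illustrative only; rsh64x64 and srad are invented names
here, not compiler code:

// The machine shift, the saturated "overshift" value, and the compare
// have no mutual dependencies, so the hardware can compute all three
// in parallel; ISEL then picks the correct result in one more cycle.
func rsh64x64(x int64, y uint64) int64 {
	shifted := srad(x, y) // SRAD x, y: the wanted result when y < 64
	over := x >> 63       // SRADconst x, [63]: all sign bits, the y >= 64 result
	if y < 64 {           // CMPUconst y, [64] feeding ISEL [0] (select on LT)
		return shifted
	}
	return over
}

// srad models PPC64 SRAD: it uses the low 7 bits of the count, and
// counts of 64..127 replicate the sign bit across the result.
func srad(x int64, y uint64) int64 {
	n := y & 127
	if n > 63 {
		n = 63
	}
	return x >> n
}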

Some ANDCCconst rules also needed reworking to ensure they simplify
when used for their flag value. This commonly occurs when prove fails
to identify a bounded shift (e.g. foo32<<uint(x&31)), as sketched
below.
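
For reference, the pattern in question looks like this (the variable
names are illustrative):

// The count x&31 is bounded by construction, so the overshift value
// can never be selected. When prove misses this, the flag-producing
// ANDCCconst rules let the lowered compare fold away anyway.
func maskedShift(foo32 uint32, x int) uint32 {
	return foo32 << uint(x&31)
}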

Change-Id: Ifc6ff4a865d68675e57745056db414b0eb6f2d34
Reviewed-on: https://go-review.googlesource.com/c/go/+/442597
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Than McIntosh <thanm@google.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Paul E. Murphy 2022-09-27 13:13:10 -05:00 committed by Paul Murphy
parent f60a2a9c94
commit f98dd29910
3 changed files with 943 additions and 851 deletions

@@ -172,61 +172,60 @@
// Lower bounded shifts first. No need to check shift value.
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD (MOVHZreg x) y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD (MOVBZreg x) y)
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD (MOVHreg x) y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD (MOVBreg x) y)
// non-constant rotates
// If shift > 64 then use -1 as shift count to shift all bits.
((Lsh64|Rsh64|Rsh64U)x64 x y) => (S(L|RA|R)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x64 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
// Unbounded shifts. Go shifts saturate to 0 or -1 when shifting beyond the number of
// bits in a type, PPC64 shifts do not (see the ISA for details).
//
// Note, y is always non-negative.
//
// Note, ISELZ is intentionally not used in lower. Where possible, ISEL is converted to ISELZ in late lower
// after all the ISEL folding rules have been exercised.
(Rsh(16|16U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
(Lsh16x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
((Rsh64U|Lsh64)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
((Rsh64U|Lsh64)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
((Rsh64U|Lsh64)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFC0] y)))
((Rsh64U|Lsh64)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00C0] y)))
(Rsh64x(64|32) <t> x y) => (ISEL [0] (SRAD <t> x y) (SRADconst <t> x [63]) (CMP(U|WU)const y [64]))
(Rsh64x16 <t> x y) => (ISEL [2] (SRAD <t> x y) (SRADconst <t> x [63]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFC0] y)))
(Rsh64x8 <t> x y) => (ISEL [2] (SRAD <t> x y) (SRADconst <t> x [63]) (Select1 <types.TypeFlags> (ANDCCconst [0x00C0] y)))
(Rsh(8|8U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
(Lsh8x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
((Rsh32U|Lsh32)x64 <t> x y) => (ISEL [0] (S(R|L)W <t> x y) (MOVDconst [0]) (CMPUconst y [32]))
((Rsh32U|Lsh32)x32 <t> x y) => (ISEL [0] (S(R|L)W <t> x y) (MOVDconst [0]) (CMPWUconst y [32]))
((Rsh32U|Lsh32)x16 <t> x y) => (ISEL [2] (S(R|L)W <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFE0] y)))
((Rsh32U|Lsh32)x8 <t> x y) => (ISEL [2] (S(R|L)W <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00E0] y)))
(Rsh32x(64|32) <t> x y) => (ISEL [0] (SRAW <t> x y) (SRAWconst <t> x [31]) (CMP(U|WU)const y [32]))
(Rsh32x16 <t> x y) => (ISEL [2] (SRAW <t> x y) (SRAWconst <t> x [31]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFE0] y)))
(Rsh32x8 <t> x y) => (ISEL [2] (SRAW <t> x y) (SRAWconst <t> x [31]) (Select1 <types.TypeFlags> (ANDCCconst [0x00E0] y)))
((Rsh64|Rsh64U|Lsh64)x32 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x32 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
((Rsh16U|Lsh16)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (CMPUconst y [16]))
((Rsh16U|Lsh16)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (CMPWUconst y [16]))
((Rsh16U|Lsh16)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF0] y)))
((Rsh16U|Lsh16)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F0] y)))
(Rsh16x(64|32) <t> x y) => (ISEL [0] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (CMP(U|WU)const y [16]))
(Rsh16x16 <t> x y) => (ISEL [2] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF0] y)))
(Rsh16x8 <t> x y) => (ISEL [2] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F0] y)))
(Rsh(16|16U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
(Lsh16x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
((Rsh8U|Lsh8)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (CMPUconst y [8]))
((Rsh8U|Lsh8)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (CMPWUconst y [8]))
((Rsh8U|Lsh8)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF8] y)))
((Rsh8U|Lsh8)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F8] y)))
(Rsh8x(64|32) <t> x y) => (ISEL [0] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (CMP(U|WU)const y [8]))
(Rsh8x16 <t> x y) => (ISEL [2] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF8] y)))
(Rsh8x8 <t> x y) => (ISEL [2] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F8] y)))
(Rsh(8|8U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
(Lsh8x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
// Catch bounded shifts in situations like foo<<uint(shift&63) which might not be caught by the prove pass.
(CMP(U|WU)const [d] (Select0 (ANDCCconst z [c]))) && uint64(d) > uint64(c) => (FlagLT)
((Rsh64|Rsh64U|Lsh64)x16 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x16 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
(Rsh(16|16U)x16 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
(Lsh16x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
(Rsh(8|8U)x16 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
(Lsh8x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
((Rsh64|Rsh64U|Lsh64)x8 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
((Rsh32|Rsh32U|Lsh32)x8 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
(Rsh(16|16U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
(Lsh16x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
(Rsh(8|8U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
(Lsh8x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
// Cleaning up shift ops
(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPU (Select0 (ANDCCconst [d] y)) (MOVDconst [c]))) && c >= d => (Select0 (ANDCCconst [d] y))
(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPUconst [c] (Select0 (ANDCCconst [d] y)))) && c >= d => (Select0 (ANDCCconst [d] y))
(ORN x (MOVDconst [-1])) => x
(S(RAD|RD|LD) x (MOVDconst [c])) => (S(RAD|RD|LD)const [c&63 | (c>>6&1*63)] x)
@@ -563,11 +562,12 @@
(OR x (MOVDconst [c])) && isU32Bit(c) => (ORconst [c] x)
// Simplify consts
(Select0 (ANDCCconst [c] (Select0 (ANDCCconst [d] x)))) => (Select0 (ANDCCconst [c&d] x))
(ANDCCconst [c] (Select0 (ANDCCconst [d] x))) => (ANDCCconst [c&d] x)
(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
(Select0 (ANDCCconst [-1] x)) => x
(Select0 (ANDCCconst [0] _)) => (MOVDconst [0])
(Select1 (ANDCCconst [0] _)) => (FlagEQ)
(XORconst [0] x) => x
(ORconst [-1] _) => (MOVDconst [-1])
(ORconst [0] x) => x
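
For the narrow count types, the rules above avoid a separate
comparison altogether: the x16/x8 cases AND the count against the bit
positions that would put it out of range and use the flag result
directly. A Go sketch of the Lsh64x8 case (lsh64x8 is an invented
name; this is a model, not compiler code):

// ANDCCconst [0x00C0] y sets EQ exactly when y&0xC0 == 0, i.e. when
// the 8-bit count is < 64; ISEL [2] (select on EQ) then picks the
// machine shift, and otherwise the saturated zero value.
func lsh64x8(x uint64, y uint8) uint64 {
	shifted := x << (uint64(y) & 127) // SLD uses the low 7 bits of the count
	if y&0xC0 == 0 {                  // flag result of the AND: EQ means y < 64
		return shifted // ISEL [2] selects the shift
	}
	return 0 // overshift saturates to 0
}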

File diff suppressed because it is too large.

@@ -1061,3 +1061,92 @@ func TestIncorrectRotate(t *testing.T) {
t.Errorf("got %x want 0", got)
}
}
//go:noinline
func variableShiftOverflow64x8(x int64, y, z uint8) (a, b, c int64) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int64(uint64(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow32x8(x int32, y, z uint8) (a, b, c int32) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int32(uint32(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow16x8(x int16, y, z uint8) (a, b, c int16) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int16(uint16(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow8x8(x int8, y, z uint8) (a, b, c int8) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int8(uint8(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow64x16(x int64, y, z uint16) (a, b, c int64) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int64(uint64(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow32x16(x int32, y, z uint16) (a, b, c int32) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int32(uint32(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow16x16(x int16, y, z uint16) (a, b, c int16) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int16(uint16(x) >> (y + z))
}
//go:noinline
func variableShiftOverflow8x16(x int8, y, z uint16) (a, b, c int8) {
// Verify junk bits are ignored when doing a variable shift.
return x >> (y + z), x << (y + z), int8(uint8(x) >> (y + z))
}
//go:noinline
func makeU8(x uint64) uint8 {
// Ensure the upper portions of the register are clear before testing large shift values
// using non-native types (e.g uint8 on PPC64).
return uint8(x)
}
//go:noinline
func makeU16(x uint64) uint16 {
// Ensure the upper portions of the register are clear before testing large shift values
// using non-native types (e.g uint8 on PPC64).
return uint16(x)
}
func TestShiftOverflow(t *testing.T) {
if v, w, z := variableShiftOverflow64x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x7fffffffffffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffffffffffe0", v, w, z)
}
if v, w, z := variableShiftOverflow32x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x7fffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffe0", v, w, z)
}
if v, w, z := variableShiftOverflow16x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x7fe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fe0", v, w, z)
}
if v, w, z := variableShiftOverflow8x8(-64, makeU8(255), 2); v != -32 || w != -128 || z != 0x60 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x60", v, w, z)
}
if v, w, z := variableShiftOverflow64x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x7fffffffffffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffffffffffe0", v, w, z)
}
if v, w, z := variableShiftOverflow32x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x7fffffe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fffffe0,", v, w, z)
}
if v, w, z := variableShiftOverflow16x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x7fe0 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x7fe0", v, w, z)
}
if v, w, z := variableShiftOverflow8x16(-64, makeU16(0xffff), 2); v != -32 || w != -128 || z != 0x60 {
t.Errorf("got %d %d 0x%x, expected -32 -128 0x60", v, w, z)
}
}