mirror of
https://github.com/golang/go
synced 2024-11-11 20:50:23 -07:00
cmd/compile: intrinsify math/bits/ReverseBytes{16|32|64} for ppc64/power10
This change intrinsifies ReverseBytes{16|32|64} by generating the corresponding new instructions in Power10: brh, brd and brw and adds a verification test for the same. On Power 9 and 8, the .go code performs optimally as it is. Performance improvement seen on Power10: ReverseBytes32 1.38ns ± 0% 1.18ns ± 0% -14.2 ReverseBytes64 1.52ns ± 0% 1.11ns ± 0% -26.87 ReverseBytes16 1.41ns ± 1% 1.18ns ± 0% -16.47 Change-Id: I88f127f3ab9ba24a772becc21ad90acfba324b37 Reviewed-on: https://go-review.googlesource.com/c/go/+/446675 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
parent
a96487613e
commit
cd1fc87156
@ -670,7 +670,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||
case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FFLOOR, ssa.OpPPC64FTRUNC, ssa.OpPPC64FCEIL,
|
||||
ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FCFIDS, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW,
|
||||
ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB, ssa.OpPPC64MFVSRD, ssa.OpPPC64MTVSRD, ssa.OpPPC64FABS, ssa.OpPPC64FNABS,
|
||||
ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD:
|
||||
ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD, ssa.OpPPC64BRH, ssa.OpPPC64BRW, ssa.OpPPC64BRD:
|
||||
r := v.Reg()
|
||||
p := s.Prog(v.Op.Asm())
|
||||
p.To.Type = obj.TYPE_REG
|
||||
|
@ -1273,3 +1273,5 @@
|
||||
(PrefetchCache ptr mem) => (DCBT ptr mem [0])
|
||||
(PrefetchCacheStreamed ptr mem) => (DCBT ptr mem [16])
|
||||
|
||||
// Use byte reverse instructions on Power10
|
||||
(Bswap(16|32|64) x) && buildcfg.GOPPC64>=10 => (BR(H|W|D) x)
|
||||
|
@ -295,6 +295,9 @@ func init() {
|
||||
{name: "XORCC", argLength: 2, reg: gp21, asm: "XORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"}, // arg0^arg1 sets CC
|
||||
{name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true}, // arg0^^arg1
|
||||
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0 (integer)
|
||||
{name: "BRD", argLength: 1, reg: gp11, asm: "BRD"}, // reversebytes64(arg0)
|
||||
{name: "BRW", argLength: 1, reg: gp11, asm: "BRW"}, // reversebytes32(arg0)
|
||||
{name: "BRH", argLength: 1, reg: gp11, asm: "BRH"}, // reversebytes16(arg0)
|
||||
{name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"}, // -arg0 (floating point)
|
||||
{name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0) (floating point)
|
||||
{name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0) (floating point, single precision)
|
||||
|
@ -238,6 +238,7 @@ var genericOps = []opData{
|
||||
{name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
|
||||
{name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
|
||||
|
||||
{name: "Bswap16", argLength: 1}, // Swap bytes
|
||||
{name: "Bswap32", argLength: 1}, // Swap bytes
|
||||
{name: "Bswap64", argLength: 1}, // Swap bytes
|
||||
|
||||
|
@ -2161,6 +2161,9 @@ const (
|
||||
OpPPC64XORCC
|
||||
OpPPC64EQV
|
||||
OpPPC64NEG
|
||||
OpPPC64BRD
|
||||
OpPPC64BRW
|
||||
OpPPC64BRH
|
||||
OpPPC64FNEG
|
||||
OpPPC64FSQRT
|
||||
OpPPC64FSQRTS
|
||||
@ -2962,6 +2965,7 @@ const (
|
||||
OpBitLen16
|
||||
OpBitLen32
|
||||
OpBitLen64
|
||||
OpBswap16
|
||||
OpBswap32
|
||||
OpBswap64
|
||||
OpBitRev8
|
||||
@ -29013,6 +29017,45 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "BRD",
|
||||
argLen: 1,
|
||||
asm: ppc64.ABRD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "BRW",
|
||||
argLen: 1,
|
||||
asm: ppc64.ABRW,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "BRH",
|
||||
argLen: 1,
|
||||
asm: ppc64.ABRH,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FNEG",
|
||||
argLen: 1,
|
||||
@ -38564,6 +38607,11 @@ var opcodeTable = [...]opInfo{
|
||||
argLen: 1,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "Bswap16",
|
||||
argLen: 1,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "Bswap32",
|
||||
argLen: 1,
|
||||
|
@ -107,6 +107,12 @@ func rewriteValuePPC64(v *Value) bool {
|
||||
return rewriteValuePPC64_OpBitLen32(v)
|
||||
case OpBitLen64:
|
||||
return rewriteValuePPC64_OpBitLen64(v)
|
||||
case OpBswap16:
|
||||
return rewriteValuePPC64_OpBswap16(v)
|
||||
case OpBswap32:
|
||||
return rewriteValuePPC64_OpBswap32(v)
|
||||
case OpBswap64:
|
||||
return rewriteValuePPC64_OpBswap64(v)
|
||||
case OpCeil:
|
||||
v.Op = OpPPC64FCEIL
|
||||
return true
|
||||
@ -1122,6 +1128,54 @@ func rewriteValuePPC64_OpBitLen64(v *Value) bool {
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValuePPC64_OpBswap16(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
// match: (Bswap16 x)
|
||||
// cond: buildcfg.GOPPC64>=10
|
||||
// result: (BRH x)
|
||||
for {
|
||||
x := v_0
|
||||
if !(buildcfg.GOPPC64 >= 10) {
|
||||
break
|
||||
}
|
||||
v.reset(OpPPC64BRH)
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValuePPC64_OpBswap32(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
// match: (Bswap32 x)
|
||||
// cond: buildcfg.GOPPC64>=10
|
||||
// result: (BRW x)
|
||||
for {
|
||||
x := v_0
|
||||
if !(buildcfg.GOPPC64 >= 10) {
|
||||
break
|
||||
}
|
||||
v.reset(OpPPC64BRW)
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValuePPC64_OpBswap64(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
// match: (Bswap64 x)
|
||||
// cond: buildcfg.GOPPC64>=10
|
||||
// result: (BRD x)
|
||||
for {
|
||||
x := v_0
|
||||
if !(buildcfg.GOPPC64 >= 10) {
|
||||
break
|
||||
}
|
||||
v.reset(OpPPC64BRD)
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValuePPC64_OpCom16(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
// match: (Com16 x)
|
||||
|
@ -4000,17 +4000,23 @@ func InitTables() {
|
||||
},
|
||||
sys.ARM64, sys.PPC64)
|
||||
|
||||
/* Use only on Power10 as the new byte reverse instructions that Power10 provide
|
||||
make it worthwhile as an intrinsic */
|
||||
brev_arch := []sys.ArchFamily{sys.AMD64, sys.ARM64, sys.ARM, sys.S390X}
|
||||
if buildcfg.GOPPC64 >= 10 {
|
||||
brev_arch = append(brev_arch, sys.PPC64)
|
||||
}
|
||||
/******** runtime/internal/sys ********/
|
||||
addF("runtime/internal/sys", "Bswap32",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
|
||||
},
|
||||
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X)
|
||||
brev_arch...)
|
||||
addF("runtime/internal/sys", "Bswap64",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
|
||||
},
|
||||
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X)
|
||||
brev_arch...)
|
||||
|
||||
/****** Prefetch ******/
|
||||
makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
@ -4537,7 +4543,16 @@ func InitTables() {
|
||||
alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...)
|
||||
alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...)
|
||||
// ReverseBytes inlines correctly, no need to intrinsify it.
|
||||
// ReverseBytes16 lowers to a rotate, no need for anything special here.
|
||||
// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
|
||||
// On Power10, 16-bit rotate is not available so use BRH instruction
|
||||
if buildcfg.GOPPC64 >= 10 {
|
||||
addF("math/bits", "ReverseBytes16",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
|
||||
},
|
||||
sys.PPC64)
|
||||
}
|
||||
|
||||
addF("math/bits", "Len64",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
|
||||
|
@ -198,6 +198,7 @@ func ReverseBytes64(n uint64) uint64 {
|
||||
// amd64:"BSWAPQ"
|
||||
// s390x:"MOVDBR"
|
||||
// arm64:"REV"
|
||||
// ppc64x/power10: "BRD"
|
||||
return bits.ReverseBytes64(n)
|
||||
}
|
||||
|
||||
@ -205,6 +206,7 @@ func ReverseBytes32(n uint32) uint32 {
|
||||
// amd64:"BSWAPL"
|
||||
// s390x:"MOVWBR"
|
||||
// arm64:"REVW"
|
||||
// ppc64x/power10: "BRW"
|
||||
return bits.ReverseBytes32(n)
|
||||
}
|
||||
|
||||
@ -214,6 +216,7 @@ func ReverseBytes16(n uint16) uint16 {
|
||||
// arm/5:"SLL","SRL","ORR"
|
||||
// arm/6:"REV16"
|
||||
// arm/7:"REV16"
|
||||
// ppc64x/power10: "BRH"
|
||||
return bits.ReverseBytes16(n)
|
||||
}
|
||||
|
||||
|
@ -1649,8 +1649,8 @@ var (
|
||||
"loong64": {},
|
||||
"mips": {"GOMIPS", "hardfloat", "softfloat"},
|
||||
"mips64": {"GOMIPS64", "hardfloat", "softfloat"},
|
||||
"ppc64": {"GOPPC64", "power8", "power9"},
|
||||
"ppc64le": {"GOPPC64", "power8", "power9"},
|
||||
"ppc64": {"GOPPC64", "power8", "power9", "power10"},
|
||||
"ppc64le": {"GOPPC64", "power8", "power9", "power10"},
|
||||
"ppc64x": {}, // A pseudo-arch representing both ppc64 and ppc64le
|
||||
"s390x": {},
|
||||
"wasm": {},
|
||||
|
Loading…
Reference in New Issue
Block a user