mirror of
https://github.com/golang/go
synced 2024-11-23 10:10:02 -07:00
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
parent
ff468a43be
commit
6f9b94ab66
1
src/cmd/asm/internal/asm/testdata/s390x.s
vendored
1
src/cmd/asm/internal/asm/testdata/s390x.s
vendored
@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
|
||||
NEGW R1 // b9130011
|
||||
NEGW R1, R2 // b9130021
|
||||
FLOGR R2, R2 // b9830022
|
||||
POPCNT R3, R4 // b9e10043
|
||||
|
||||
AND R1, R2 // b9800021
|
||||
AND R1, R2, R3 // b9e42031
|
||||
|
@ -3410,7 +3410,7 @@ func init() {
|
||||
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
|
||||
},
|
||||
sys.PPC64, sys.ARM64)
|
||||
sys.PPC64, sys.ARM64, sys.S390X)
|
||||
addF("math/bits", "OnesCount32",
|
||||
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
|
||||
sys.AMD64)
|
||||
@ -3418,7 +3418,7 @@ func init() {
|
||||
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
|
||||
},
|
||||
sys.PPC64, sys.ARM64)
|
||||
sys.PPC64, sys.ARM64, sys.S390X)
|
||||
addF("math/bits", "OnesCount16",
|
||||
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
|
||||
sys.AMD64)
|
||||
@ -3426,8 +3426,12 @@ func init() {
|
||||
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0])
|
||||
},
|
||||
sys.ARM64)
|
||||
// Note: no OnesCount8, the Go implementation is faster - just a table load.
|
||||
sys.ARM64, sys.S390X)
|
||||
addF("math/bits", "OnesCount8",
|
||||
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpPopCount8, types.Types[TINT], args[0])
|
||||
},
|
||||
sys.S390X)
|
||||
addF("math/bits", "OnesCount",
|
||||
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
|
||||
sys.AMD64)
|
||||
|
@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
|
||||
p.To.Type = obj.TYPE_MEM
|
||||
p.To.Name = obj.NAME_EXTERN
|
||||
p.To.Sym = v.Aux.(*obj.LSym)
|
||||
case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW,
|
||||
case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT,
|
||||
ssa.OpS390XNEG, ssa.OpS390XNEGW,
|
||||
ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
p.From.Type = obj.TYPE_REG
|
||||
@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
|
||||
p.To.Reg = v.Reg()
|
||||
case ssa.OpS390XNOT, ssa.OpS390XNOTW:
|
||||
v.Fatalf("NOT/NOTW generated %s", v.LongString())
|
||||
case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8:
|
||||
v.Fatalf("SumBytes generated %s", v.LongString())
|
||||
case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE,
|
||||
ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE,
|
||||
ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE,
|
||||
|
@ -88,6 +88,34 @@
|
||||
|
||||
(BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
|
||||
|
||||
// POPCNT treats the input register as a vector of 8 bytes, producing
|
||||
// a population count for each individual byte. For inputs larger than
|
||||
// a single byte we therefore need to sum the individual bytes produced
|
||||
// by the POPCNT instruction. For example, the following instruction
|
||||
// sequence could be used to calculate the population count of a 4-byte
|
||||
// value:
|
||||
//
|
||||
// MOVD $0x12345678, R1 // R1=0x12345678 <-- input
|
||||
// POPCNT R1, R2 // R2=0x02030404
|
||||
// SRW $16, R2, R3 // R3=0x00000203
|
||||
// ADDW R2, R3, R4 // R4=0x02030607
|
||||
// SRW $8, R4, R5 // R5=0x00020306
|
||||
// ADDW R4, R5, R6 // R6=0x0205090d
|
||||
// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13
|
||||
//
|
||||
(PopCount8 x) -> (POPCNT (MOVBZreg x))
|
||||
(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
|
||||
(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
|
||||
(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
|
||||
|
||||
// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
|
||||
// 2, 4 or 8 bytes respectively. The result is a single byte however
|
||||
// other bytes might contain junk so a zero extension is required if
|
||||
// the desired output type is larger than 1 byte.
|
||||
(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
|
||||
(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
|
||||
(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
|
||||
|
||||
(Bswap64 x) -> (MOVDBR x)
|
||||
(Bswap32 x) -> (MOVWBR x)
|
||||
|
||||
|
@ -530,6 +530,25 @@ func init() {
|
||||
clobberFlags: true,
|
||||
},
|
||||
|
||||
// population count
|
||||
//
|
||||
// Counts the number of ones in each byte of arg0
|
||||
// and places the result into the corresponding byte
|
||||
// of the result.
|
||||
{
|
||||
name: "POPCNT",
|
||||
argLength: 1,
|
||||
reg: gp11,
|
||||
asm: "POPCNT",
|
||||
typ: "UInt64",
|
||||
clobberFlags: true,
|
||||
},
|
||||
|
||||
// pseudo operations to sum the output of the POPCNT instruction
|
||||
{name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
|
||||
{name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
|
||||
{name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
|
||||
|
||||
// store multiple
|
||||
{
|
||||
name: "STMG2",
|
||||
|
@ -1898,6 +1898,10 @@ const (
|
||||
OpS390XLoweredAtomicExchange32
|
||||
OpS390XLoweredAtomicExchange64
|
||||
OpS390XFLOGR
|
||||
OpS390XPOPCNT
|
||||
OpS390XSumBytes2
|
||||
OpS390XSumBytes4
|
||||
OpS390XSumBytes8
|
||||
OpS390XSTMG2
|
||||
OpS390XSTMG3
|
||||
OpS390XSTMG4
|
||||
@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "POPCNT",
|
||||
argLen: 1,
|
||||
clobberFlags: true,
|
||||
asm: s390x.APOPCNT,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "SumBytes2",
|
||||
argLen: 1,
|
||||
reg: regInfo{},
|
||||
},
|
||||
{
|
||||
name: "SumBytes4",
|
||||
argLen: 1,
|
||||
reg: regInfo{},
|
||||
},
|
||||
{
|
||||
name: "SumBytes8",
|
||||
argLen: 1,
|
||||
reg: regInfo{},
|
||||
},
|
||||
{
|
||||
name: "STMG2",
|
||||
auxType: auxSymOff,
|
||||
|
@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool {
|
||||
return rewriteValueS390X_OpOr8_0(v)
|
||||
case OpOrB:
|
||||
return rewriteValueS390X_OpOrB_0(v)
|
||||
case OpPopCount16:
|
||||
return rewriteValueS390X_OpPopCount16_0(v)
|
||||
case OpPopCount32:
|
||||
return rewriteValueS390X_OpPopCount32_0(v)
|
||||
case OpPopCount64:
|
||||
return rewriteValueS390X_OpPopCount64_0(v)
|
||||
case OpPopCount8:
|
||||
return rewriteValueS390X_OpPopCount8_0(v)
|
||||
case OpRound:
|
||||
return rewriteValueS390X_OpRound_0(v)
|
||||
case OpRound32F:
|
||||
@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool {
|
||||
return rewriteValueS390X_OpS390XSUBconst_0(v)
|
||||
case OpS390XSUBload:
|
||||
return rewriteValueS390X_OpS390XSUBload_0(v)
|
||||
case OpS390XSumBytes2:
|
||||
return rewriteValueS390X_OpS390XSumBytes2_0(v)
|
||||
case OpS390XSumBytes4:
|
||||
return rewriteValueS390X_OpS390XSumBytes4_0(v)
|
||||
case OpS390XSumBytes8:
|
||||
return rewriteValueS390X_OpS390XSumBytes8_0(v)
|
||||
case OpS390XXOR:
|
||||
return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v)
|
||||
case OpS390XXORW:
|
||||
@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool {
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpPopCount16_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (PopCount16 x)
|
||||
// cond:
|
||||
// result: (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XMOVBZreg)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XSumBytes2, typ.UInt8)
|
||||
v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt16)
|
||||
v1.AddArg(x)
|
||||
v0.AddArg(v1)
|
||||
v.AddArg(v0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpPopCount32_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (PopCount32 x)
|
||||
// cond:
|
||||
// result: (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XMOVBZreg)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XSumBytes4, typ.UInt8)
|
||||
v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt32)
|
||||
v1.AddArg(x)
|
||||
v0.AddArg(v1)
|
||||
v.AddArg(v0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpPopCount64_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (PopCount64 x)
|
||||
// cond:
|
||||
// result: (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XMOVBZreg)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XSumBytes8, typ.UInt8)
|
||||
v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt64)
|
||||
v1.AddArg(x)
|
||||
v0.AddArg(v1)
|
||||
v.AddArg(v0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpPopCount8_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (PopCount8 x)
|
||||
// cond:
|
||||
// result: (POPCNT (MOVBZreg x))
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XPOPCNT)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XMOVBZreg, typ.UInt64)
|
||||
v0.AddArg(x)
|
||||
v.AddArg(v0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpRound_0(v *Value) bool {
|
||||
// match: (Round x)
|
||||
// cond:
|
||||
@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueS390X_OpS390XSumBytes2_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (SumBytes2 x)
|
||||
// cond:
|
||||
// result: (ADDW (SRWconst <typ.UInt8> x [8]) x)
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XADDW)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt8)
|
||||
v0.AuxInt = 8
|
||||
v0.AddArg(x)
|
||||
v.AddArg(v0)
|
||||
v.AddArg(x)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpS390XSumBytes4_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (SumBytes4 x)
|
||||
// cond:
|
||||
// result: (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XSumBytes2)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt16)
|
||||
v1 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt16)
|
||||
v1.AuxInt = 16
|
||||
v1.AddArg(x)
|
||||
v0.AddArg(v1)
|
||||
v0.AddArg(x)
|
||||
v.AddArg(v0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpS390XSumBytes8_0(v *Value) bool {
|
||||
b := v.Block
|
||||
_ = b
|
||||
typ := &b.Func.Config.Types
|
||||
_ = typ
|
||||
// match: (SumBytes8 x)
|
||||
// cond:
|
||||
// result: (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
|
||||
for {
|
||||
x := v.Args[0]
|
||||
v.reset(OpS390XSumBytes4)
|
||||
v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt32)
|
||||
v1 := b.NewValue0(v.Pos, OpS390XSRDconst, typ.UInt32)
|
||||
v1.AuxInt = 32
|
||||
v1.AddArg(x)
|
||||
v0.AddArg(v1)
|
||||
v0.AddArg(x)
|
||||
v.AddArg(v0)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValueS390X_OpS390XXOR_0(v *Value) bool {
|
||||
// match: (XOR x (MOVDconst [c]))
|
||||
// cond: isU32Bit(c)
|
||||
|
@ -271,6 +271,9 @@ const (
|
||||
// find leftmost one
|
||||
AFLOGR
|
||||
|
||||
// population count
|
||||
APOPCNT
|
||||
|
||||
// integer bitwise
|
||||
AAND
|
||||
AANDW
|
||||
|
@ -45,6 +45,7 @@ var Anames = []string{
|
||||
"MOVDLT",
|
||||
"MOVDNE",
|
||||
"FLOGR",
|
||||
"POPCNT",
|
||||
"AND",
|
||||
"ANDW",
|
||||
"OR",
|
||||
|
@ -246,6 +246,9 @@ var optab = []Optab{
|
||||
// find leftmost one
|
||||
Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0},
|
||||
|
||||
// population count
|
||||
Optab{APOPCNT, C_REG, C_NONE, C_NONE, C_REG, 9, 0},
|
||||
|
||||
// compare
|
||||
Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0},
|
||||
Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0},
|
||||
@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
|
||||
// FLOGR also writes a mask to p.To.Reg+1.
|
||||
zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
|
||||
|
||||
case 9: // population count
|
||||
zRRE(op_POPCNT, uint32(p.To.Reg), uint32(p.From.Reg), asm)
|
||||
|
||||
case 10: // subtract reg [reg] reg
|
||||
r := int(p.Reg)
|
||||
|
||||
|
@ -103,27 +103,36 @@ func Len8(n uint8) int {
|
||||
func OnesCount(n uint) int {
|
||||
// amd64:"POPCNTQ",".*support_popcnt"
|
||||
// arm64:"VCNT","VUADDLV"
|
||||
// s390x:"POPCNT"
|
||||
return bits.OnesCount(n)
|
||||
}
|
||||
|
||||
func OnesCount64(n uint64) int {
|
||||
// amd64:"POPCNTQ",".*support_popcnt"
|
||||
// arm64:"VCNT","VUADDLV"
|
||||
// s390x:"POPCNT"
|
||||
return bits.OnesCount64(n)
|
||||
}
|
||||
|
||||
func OnesCount32(n uint32) int {
|
||||
// amd64:"POPCNTL",".*support_popcnt"
|
||||
// arm64:"VCNT","VUADDLV"
|
||||
// s390x:"POPCNT"
|
||||
return bits.OnesCount32(n)
|
||||
}
|
||||
|
||||
func OnesCount16(n uint16) int {
|
||||
// amd64:"POPCNTL",".*support_popcnt"
|
||||
// arm64:"VCNT","VUADDLV"
|
||||
// s390x:"POPCNT"
|
||||
return bits.OnesCount16(n)
|
||||
}
|
||||
|
||||
func OnesCount8(n uint8) int {
|
||||
// s390x:"POPCNT"
|
||||
return bits.OnesCount8(n)
|
||||
}
|
||||
|
||||
// ----------------------- //
|
||||
// bits.ReverseBytes //
|
||||
// ----------------------- //
|
||||
|
Loading…
Reference in New Issue
Block a user