1
0
mirror of https://github.com/golang/go synced 2024-11-23 10:10:02 -07:00

cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x

This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility, which uses the same facility bit (45) as the
'distinct-operands' facility. Because 'distinct-operands' is already a
prerequisite for Go on s390x, popcnt can be used without a feature check.

The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.

Unlike on other architectures, the new instruction sequence is faster
than the pure Go implementation even for OnesCount8, so OnesCount8 is
also implemented using the intrinsic.

name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)

Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
Michael Munday 2018-05-25 17:54:58 +01:00
parent ff468a43be
commit 6f9b94ab66
11 changed files with 261 additions and 5 deletions

View File

@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
NEGW R1 // b9130011
NEGW R1, R2 // b9130021
FLOGR R2, R2 // b9830022
POPCNT R3, R4 // b9e10043
AND R1, R2 // b9800021
AND R1, R2, R3 // b9e42031

View File

@ -3410,7 +3410,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
},
sys.PPC64, sys.ARM64)
sys.PPC64, sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount32",
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64)
@ -3418,7 +3418,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
},
sys.PPC64, sys.ARM64)
sys.PPC64, sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount16",
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64)
@ -3426,8 +3426,12 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0])
},
sys.ARM64)
// Note: no OnesCount8, the Go implementation is faster - just a table load.
sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount8, types.Types[TINT], args[0])
},
sys.S390X)
addF("math/bits", "OnesCount",
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64)

View File

@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = v.Aux.(*obj.LSym)
case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW,
case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT,
ssa.OpS390XNEG, ssa.OpS390XNEGW,
ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = v.Reg()
case ssa.OpS390XNOT, ssa.OpS390XNOTW:
v.Fatalf("NOT/NOTW generated %s", v.LongString())
case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8:
v.Fatalf("SumBytes generated %s", v.LongString())
case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE,
ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE,
ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE,

View File

@ -88,6 +88,34 @@
(BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
// POPCNT treats the input register as a vector of 8 bytes, producing
// a population count for each individual byte. For inputs larger than
// a single byte we therefore need to sum the individual bytes produced
// by the POPCNT instruction. For example, the following instruction
// sequence could be used to calculate the population count of a 4-byte
// value:
//
// MOVD $0x12345678, R1 // R1=0x12345678 <-- input
// POPCNT R1, R2 // R2=0x02030404
// SRW $16, R2, R3 // R3=0x00000203
// ADDW R2, R3, R4 // R4=0x02030607
// SRW $8, R4, R5 // R5=0x00020306
// ADDW R4, R5, R6 // R6=0x0205090d
// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13
//
(PopCount8 x) -> (POPCNT (MOVBZreg x))
(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
// 2, 4 or 8 bytes respectively. The result is a single byte; however,
// the other bytes might contain junk, so a zero extension is required
// if the desired output type is larger than 1 byte.
(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
(Bswap64 x) -> (MOVDBR x)
(Bswap32 x) -> (MOVWBR x)

View File

@ -530,6 +530,25 @@ func init() {
clobberFlags: true,
},
// population count
//
// Counts the number of ones in each byte of arg0
// and places the result into the corresponding byte
// of the result.
{
name: "POPCNT",
argLength: 1,
reg: gp11,
asm: "POPCNT",
typ: "UInt64",
clobberFlags: true,
},
// pseudo operations to sum the output of the POPCNT instruction
{name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
{name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
{name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
// store multiple
{
name: "STMG2",

View File

@ -1898,6 +1898,10 @@ const (
OpS390XLoweredAtomicExchange32
OpS390XLoweredAtomicExchange64
OpS390XFLOGR
OpS390XPOPCNT
OpS390XSumBytes2
OpS390XSumBytes4
OpS390XSumBytes8
OpS390XSTMG2
OpS390XSTMG3
OpS390XSTMG4
@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "POPCNT",
argLen: 1,
clobberFlags: true,
asm: s390x.APOPCNT,
reg: regInfo{
inputs: []inputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
outputs: []outputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
},
},
{
name: "SumBytes2",
argLen: 1,
reg: regInfo{},
},
{
name: "SumBytes4",
argLen: 1,
reg: regInfo{},
},
{
name: "SumBytes8",
argLen: 1,
reg: regInfo{},
},
{
name: "STMG2",
auxType: auxSymOff,

View File

@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpOr8_0(v)
case OpOrB:
return rewriteValueS390X_OpOrB_0(v)
case OpPopCount16:
return rewriteValueS390X_OpPopCount16_0(v)
case OpPopCount32:
return rewriteValueS390X_OpPopCount32_0(v)
case OpPopCount64:
return rewriteValueS390X_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValueS390X_OpPopCount8_0(v)
case OpRound:
return rewriteValueS390X_OpRound_0(v)
case OpRound32F:
@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpS390XSUBconst_0(v)
case OpS390XSUBload:
return rewriteValueS390X_OpS390XSUBload_0(v)
case OpS390XSumBytes2:
return rewriteValueS390X_OpS390XSumBytes2_0(v)
case OpS390XSumBytes4:
return rewriteValueS390X_OpS390XSumBytes4_0(v)
case OpS390XSumBytes8:
return rewriteValueS390X_OpS390XSumBytes8_0(v)
case OpS390XXOR:
return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v)
case OpS390XXORW:
@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool {
return true
}
}
// rewriteValueS390X_OpPopCount16_0 applies the unconditional s390x lowering
// rule for PopCount16:
//
//	(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpPopCount16_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XMOVBZreg)

	// The outer value is allocated before the inner one; keep this order.
	sum := blk.NewValue0(v.Pos, OpS390XSumBytes2, typs.UInt8)
	popcnt := blk.NewValue0(v.Pos, OpS390XPOPCNT, typs.UInt16)

	popcnt.AddArg(arg)
	sum.AddArg(popcnt)
	v.AddArg(sum)
	return true
}
// rewriteValueS390X_OpPopCount32_0 applies the unconditional s390x lowering
// rule for PopCount32:
//
//	(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpPopCount32_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XMOVBZreg)

	// The outer value is allocated before the inner one; keep this order.
	sum := blk.NewValue0(v.Pos, OpS390XSumBytes4, typs.UInt8)
	popcnt := blk.NewValue0(v.Pos, OpS390XPOPCNT, typs.UInt32)

	popcnt.AddArg(arg)
	sum.AddArg(popcnt)
	v.AddArg(sum)
	return true
}
// rewriteValueS390X_OpPopCount64_0 applies the unconditional s390x lowering
// rule for PopCount64:
//
//	(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpPopCount64_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XMOVBZreg)

	// The outer value is allocated before the inner one; keep this order.
	sum := blk.NewValue0(v.Pos, OpS390XSumBytes8, typs.UInt8)
	popcnt := blk.NewValue0(v.Pos, OpS390XPOPCNT, typs.UInt64)

	popcnt.AddArg(arg)
	sum.AddArg(popcnt)
	v.AddArg(sum)
	return true
}
// rewriteValueS390X_OpPopCount8_0 applies the unconditional s390x lowering
// rule for PopCount8:
//
//	(PopCount8 x) -> (POPCNT (MOVBZreg x))
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpPopCount8_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XPOPCNT)

	// The MOVBZreg wrapper around the input is created as a UInt64 value.
	zext := blk.NewValue0(v.Pos, OpS390XMOVBZreg, typs.UInt64)
	zext.AddArg(arg)

	v.AddArg(zext)
	return true
}
func rewriteValueS390X_OpRound_0(v *Value) bool {
// match: (Round x)
// cond:
@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool {
}
return false
}
// rewriteValueS390X_OpS390XSumBytes2_0 expands the SumBytes2 pseudo operation
// using the unconditional rule:
//
//	(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpS390XSumBytes2_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XADDW)

	// x shifted right by 8 bits is the first addend.
	shifted := blk.NewValue0(v.Pos, OpS390XSRWconst, typs.UInt8)
	shifted.AuxInt = 8
	shifted.AddArg(arg)

	// Argument order matters: shifted value first, then the unshifted input.
	v.AddArg(shifted)
	v.AddArg(arg)
	return true
}
// rewriteValueS390X_OpS390XSumBytes4_0 expands the SumBytes4 pseudo operation
// using the unconditional rule:
//
//	(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpS390XSumBytes4_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XSumBytes2)

	// The add is allocated before the shift that feeds it; keep this order.
	folded := blk.NewValue0(v.Pos, OpS390XADDW, typs.UInt16)
	shifted := blk.NewValue0(v.Pos, OpS390XSRWconst, typs.UInt16)

	shifted.AuxInt = 16
	shifted.AddArg(arg)

	// Argument order matters: shifted value first, then the unshifted input.
	folded.AddArg(shifted)
	folded.AddArg(arg)

	v.AddArg(folded)
	return true
}
// rewriteValueS390X_OpS390XSumBytes8_0 expands the SumBytes8 pseudo operation
// using the unconditional rule:
//
//	(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
//
// v is rewritten in place and the function always reports true.
func rewriteValueS390X_OpS390XSumBytes8_0(v *Value) bool {
	blk := v.Block
	typs := &blk.Func.Config.Types

	// Capture the operand before v is reset to the new opcode.
	arg := v.Args[0]
	v.reset(OpS390XSumBytes4)

	// The add is allocated before the shift that feeds it; keep this order.
	folded := blk.NewValue0(v.Pos, OpS390XADDW, typs.UInt32)
	shifted := blk.NewValue0(v.Pos, OpS390XSRDconst, typs.UInt32)

	shifted.AuxInt = 32
	shifted.AddArg(arg)

	// Argument order matters: shifted value first, then the unshifted input.
	folded.AddArg(shifted)
	folded.AddArg(arg)

	v.AddArg(folded)
	return true
}
func rewriteValueS390X_OpS390XXOR_0(v *Value) bool {
// match: (XOR x (MOVDconst [c]))
// cond: isU32Bit(c)

View File

@ -271,6 +271,9 @@ const (
// find leftmost one
AFLOGR
// population count
APOPCNT
// integer bitwise
AAND
AANDW

View File

@ -45,6 +45,7 @@ var Anames = []string{
"MOVDLT",
"MOVDNE",
"FLOGR",
"POPCNT",
"AND",
"ANDW",
"OR",

View File

@ -246,6 +246,9 @@ var optab = []Optab{
// find leftmost one
Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0},
// population count
Optab{APOPCNT, C_REG, C_NONE, C_NONE, C_REG, 9, 0},
// compare
Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0},
Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0},
@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
// FLOGR also writes a mask to p.To.Reg+1.
zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
case 9: // population count
zRRE(op_POPCNT, uint32(p.To.Reg), uint32(p.From.Reg), asm)
case 10: // subtract reg [reg] reg
r := int(p.Reg)

View File

@ -103,27 +103,36 @@ func Len8(n uint8) int {
// OnesCount returns the number of one bits in n by calling bits.OnesCount.
// NOTE(review): the arch-prefixed comments in the body (amd64, arm64, s390x)
// look like per-architecture expected-instruction checks read by a test
// harness; confirm before moving or rewording them.
func OnesCount(n uint) int {
// amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount(n)
}
// OnesCount64 returns the number of one bits in n by calling bits.OnesCount64.
// NOTE(review): the arch-prefixed comments in the body (amd64, arm64, s390x)
// look like per-architecture expected-instruction checks read by a test
// harness; confirm before moving or rewording them.
func OnesCount64(n uint64) int {
// amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount64(n)
}
// OnesCount32 returns the number of one bits in n by calling bits.OnesCount32.
// NOTE(review): the arch-prefixed comments in the body (amd64, arm64, s390x)
// look like per-architecture expected-instruction checks read by a test
// harness; confirm before moving or rewording them.
func OnesCount32(n uint32) int {
// amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount32(n)
}
// OnesCount16 returns the number of one bits in n by calling bits.OnesCount16.
// NOTE(review): the arch-prefixed comments in the body (amd64, arm64, s390x)
// look like per-architecture expected-instruction checks read by a test
// harness; confirm before moving or rewording them.
func OnesCount16(n uint16) int {
// amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount16(n)
}
// OnesCount8 returns the number of one bits in n by calling bits.OnesCount8.
// Only an s390x-prefixed comment is present in the body.
// NOTE(review): that comment looks like a per-architecture
// expected-instruction check read by a test harness; confirm before moving
// or rewording it.
func OnesCount8(n uint8) int {
// s390x:"POPCNT"
return bits.OnesCount8(n)
}
// ----------------------- //
// bits.ReverseBytes //
// ----------------------- //