mirror of
https://github.com/golang/go
synced 2024-11-15 00:50:32 -07:00
cmd/compile: optimize Ctz64 on 386
Compared with the version generated by dec64.rules based on Ctz32, the number of assembly instructions is reduced by half. SwissMap uses TrailingZeros64 to find the first match in its control group and may benefit from this CL on 386 architectures. goos: linux goarch: 386 cpu: 13th Gen Intel(R) Core(TM) i7-13700H │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ TrailingZeros64-20 0.8828n ± 1% 0.6299n ± 1% -28.65% (p=0.000 n=20) Change-Id: Iba08a3f4e13efd3349715dfb7fcd5fd470286cd3 Reviewed-on: https://go-review.googlesource.com/c/go/+/624376 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org>
This commit is contained in:
parent
bea9b91f0f
commit
0140aae6d0
@ -63,6 +63,7 @@
|
||||
(Ctz16NonZero ...) => (BSFL ...)
|
||||
(Ctz32 ...) => (LoweredCtz32 ...)
|
||||
(Ctz32NonZero ...) => (BSFL ...)
|
||||
(Ctz64On32 ...) => (LoweredCtz64 ...)
|
||||
|
||||
// Lowering extension
|
||||
(SignExt8to16 ...) => (MOVBLSX ...)
|
||||
|
@ -300,9 +300,10 @@ func init() {
|
||||
|
||||
{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
|
||||
|
||||
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
||||
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
||||
{name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes
|
||||
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
||||
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
||||
{name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes
|
||||
{name: "LoweredCtz64", argLength: 2, reg: gp21, resultNotInArgs: true, clobberFlags: true}, // arg1<<32+arg0 # of low-order zeroes
|
||||
|
||||
{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
|
||||
{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
|
||||
|
@ -229,6 +229,7 @@ var genericOps = []opData{
|
||||
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
|
||||
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
|
||||
{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
|
||||
{name: "Ctz64On32", argLength: 2}, // Count trailing (low order) zeroes (returns 0-64) in arg[1]<<32+arg[0]
|
||||
{name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
|
||||
{name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
|
||||
{name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
|
||||
|
@ -469,6 +469,7 @@ const (
|
||||
Op386BSFL
|
||||
Op386BSFW
|
||||
Op386LoweredCtz32
|
||||
Op386LoweredCtz64
|
||||
Op386BSRL
|
||||
Op386BSRW
|
||||
Op386BSWAPL
|
||||
@ -3093,6 +3094,7 @@ const (
|
||||
OpCtz16
|
||||
OpCtz32
|
||||
OpCtz64
|
||||
OpCtz64On32
|
||||
OpCtz8NonZero
|
||||
OpCtz16NonZero
|
||||
OpCtz32NonZero
|
||||
@ -5195,6 +5197,21 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "LoweredCtz64",
|
||||
argLen: 2,
|
||||
resultNotInArgs: true,
|
||||
clobberFlags: true,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 239}, // AX CX DX BX BP SI DI
|
||||
{1, 239}, // AX CX DX BX BP SI DI
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 239}, // AX CX DX BX BP SI DI
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "BSRL",
|
||||
argLen: 1,
|
||||
@ -40458,6 +40475,11 @@ var opcodeTable = [...]opInfo{
|
||||
argLen: 1,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "Ctz64On32",
|
||||
argLen: 2,
|
||||
generic: true,
|
||||
},
|
||||
{
|
||||
name: "Ctz8NonZero",
|
||||
argLen: 1,
|
||||
|
@ -323,6 +323,9 @@ func rewriteValue386(v *Value) bool {
|
||||
case OpCtz32NonZero:
|
||||
v.Op = Op386BSFL
|
||||
return true
|
||||
case OpCtz64On32:
|
||||
v.Op = Op386LoweredCtz64
|
||||
return true
|
||||
case OpCtz8:
|
||||
return rewriteValue386_OpCtz8(v)
|
||||
case OpCtz8NonZero:
|
||||
|
@ -747,7 +747,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
|
||||
},
|
||||
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
|
||||
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
|
||||
addF("math/bits", "TrailingZeros64",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
|
||||
hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
|
||||
return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
|
||||
},
|
||||
sys.I386)
|
||||
addF("math/bits", "TrailingZeros32",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
|
||||
|
@ -850,6 +850,54 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||
p2.To.Type = obj.TYPE_REG
|
||||
p2.To.Reg = v.Reg()
|
||||
|
||||
// NOP (so the JNZ has somewhere to land)
|
||||
nop := s.Prog(obj.ANOP)
|
||||
p1.To.SetTarget(nop)
|
||||
case ssa.Op386LoweredCtz64:
|
||||
if v.Args[0].Reg() == v.Reg() {
|
||||
v.Fatalf("input[0] and output in the same register %s", v.LongString())
|
||||
}
|
||||
if v.Args[1].Reg() == v.Reg() {
|
||||
v.Fatalf("input[1] and output in the same register %s", v.LongString())
|
||||
}
|
||||
|
||||
// BSFL arg0, out
|
||||
p := s.Prog(x86.ABSFL)
|
||||
p.From.Type = obj.TYPE_REG
|
||||
p.From.Reg = v.Args[0].Reg()
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = v.Reg()
|
||||
|
||||
// JNZ 5(PC)
|
||||
p1 := s.Prog(x86.AJNE)
|
||||
p1.To.Type = obj.TYPE_BRANCH
|
||||
|
||||
// BSFL arg1, out
|
||||
p2 := s.Prog(x86.ABSFL)
|
||||
p2.From.Type = obj.TYPE_REG
|
||||
p2.From.Reg = v.Args[1].Reg()
|
||||
p2.To.Type = obj.TYPE_REG
|
||||
p2.To.Reg = v.Reg()
|
||||
|
||||
// JNZ 2(PC)
|
||||
p3 := s.Prog(x86.AJNE)
|
||||
p3.To.Type = obj.TYPE_BRANCH
|
||||
|
||||
// MOVL $32, out
|
||||
p4 := s.Prog(x86.AMOVL)
|
||||
p4.From.Type = obj.TYPE_CONST
|
||||
p4.From.Offset = 32
|
||||
p4.To.Type = obj.TYPE_REG
|
||||
p4.To.Reg = v.Reg()
|
||||
|
||||
// ADDL $32, out
|
||||
p5 := s.Prog(x86.AADDL)
|
||||
p5.From.Type = obj.TYPE_CONST
|
||||
p5.From.Offset = 32
|
||||
p5.To.Type = obj.TYPE_REG
|
||||
p5.To.Reg = v.Reg()
|
||||
p3.To.SetTarget(p5)
|
||||
|
||||
// NOP (so the JNZ has somewhere to land)
|
||||
nop := s.Prog(obj.ANOP)
|
||||
p1.To.SetTarget(nop)
|
||||
|
Loading…
Reference in New Issue
Block a user