1
0
mirror of https://github.com/golang/go synced 2024-11-15 00:30:31 -07:00

cmd/compile: optimize Ctz64 on 386

Compared with the version generated by dec64.rules based on Ctz32,
the number of assembly instructions is reduced by half.

SwissMap uses TrailingZeros64 to find the first match in its control
group and may benefit from this CL on 386 architectures.

goos: linux
goarch: 386
cpu: 13th Gen Intel(R) Core(TM) i7-13700H
                   │   old.txt    │               new.txt                │
                   │    sec/op    │    sec/op     vs base                │
TrailingZeros64-20   0.8828n ± 1%   0.6299n ± 1%  -28.65% (p=0.000 n=20)

Change-Id: Iba08a3f4e13efd3349715dfb7fcd5fd470286cd3
Reviewed-on: https://go-review.googlesource.com/c/go/+/624376
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
This commit is contained in:
Youlin Feng 2024-10-22 17:18:11 +08:00 committed by Gopher Robot
parent bea9b91f0f
commit 0140aae6d0
7 changed files with 87 additions and 4 deletions

View File

@ -63,6 +63,7 @@
(Ctz16NonZero ...) => (BSFL ...)
(Ctz32 ...) => (LoweredCtz32 ...)
(Ctz32NonZero ...) => (BSFL ...)
(Ctz64On32 ...) => (LoweredCtz64 ...)
// Lowering extension
(SignExt8to16 ...) => (MOVBLSX ...)

View File

@ -303,6 +303,7 @@ func init() {
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
{name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes
{name: "LoweredCtz64", argLength: 2, reg: gp21, resultNotInArgs: true, clobberFlags: true}, // arg1<<32+arg0 # of low-order zeroes
{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero

View File

@ -229,6 +229,7 @@ var genericOps = []opData{
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
{name: "Ctz64On32", argLength: 2}, // Count trailing (low order) zeroes (returns 0-64) in arg[1]<<32+arg[0]
{name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
{name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
{name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31

View File

@ -469,6 +469,7 @@ const (
Op386BSFL
Op386BSFW
Op386LoweredCtz32
Op386LoweredCtz64
Op386BSRL
Op386BSRW
Op386BSWAPL
@ -3093,6 +3094,7 @@ const (
OpCtz16
OpCtz32
OpCtz64
OpCtz64On32
OpCtz8NonZero
OpCtz16NonZero
OpCtz32NonZero
@ -5195,6 +5197,21 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredCtz64",
argLen: 2,
resultNotInArgs: true,
clobberFlags: true,
reg: regInfo{
inputs: []inputInfo{
{0, 239}, // AX CX DX BX BP SI DI
{1, 239}, // AX CX DX BX BP SI DI
},
outputs: []outputInfo{
{0, 239}, // AX CX DX BX BP SI DI
},
},
},
{
name: "BSRL",
argLen: 1,
@ -40458,6 +40475,11 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "Ctz64On32",
argLen: 2,
generic: true,
},
{
name: "Ctz8NonZero",
argLen: 1,

View File

@ -323,6 +323,9 @@ func rewriteValue386(v *Value) bool {
case OpCtz32NonZero:
v.Op = Op386BSFL
return true
case OpCtz64On32:
v.Op = Op386LoweredCtz64
return true
case OpCtz8:
return rewriteValue386_OpCtz8(v)
case OpCtz8NonZero:

View File

@ -747,7 +747,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
// math/bits.TrailingZeros64 on 386: the 64-bit operand is split into its
// two 32-bit halves and handed to Ctz64On32, which takes the low word as
// its first argument and the high word as its second.
addF("math/bits", "TrailingZeros64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		operand := args[0]
		wordType := types.Types[types.TUINT32]
		// Build the low half first, then the high half.
		lowWord := s.newValue1(ssa.OpInt64Lo, wordType, operand)
		highWord := s.newValue1(ssa.OpInt64Hi, wordType, operand)
		return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lowWord, highWord)
	},
	sys.I386)
addF("math/bits", "TrailingZeros32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])

View File

@ -850,6 +850,54 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p2.To.Type = obj.TYPE_REG
p2.To.Reg = v.Reg()
// NOP (so the JNZ has somewhere to land)
nop := s.Prog(obj.ANOP)
p1.To.SetTarget(nop)
case ssa.Op386LoweredCtz64:
// Count the trailing zeroes of the 64-bit value arg1<<32 + arg0
// (Args[0] is the low word, Args[1] the high word) into v.Reg().
// Emitted sequence:
//
//	BSFL arg0, out
//	JNZ  done        // low word non-zero: out = tz(arg0)
//	BSFL arg1, out
//	JNZ  add         // high word non-zero: out = tz(arg1), then +32
//	MOVL $32, out    // both words zero: 32, then +32 = 64
// add:
//	ADDL $32, out
// done:
//	NOP
//
// BSFL sets ZF when its source is zero and leaves the destination
// undefined in that case, which is why each BSFL is followed by a JNZ.

// The first BSFL writes the output register before arg1 is read, so the
// output must not share a register with an input. The op is declared
// resultNotInArgs, so hitting either check means a compiler bug.
if v.Args[0].Reg() == v.Reg() {
v.Fatalf("input[0] and output in the same register %s", v.LongString())
}
if v.Args[1].Reg() == v.Reg() {
v.Fatalf("input[1] and output in the same register %s", v.LongString())
}
// BSFL arg0, out
// Scan the low word first.
p := s.Prog(x86.ABSFL)
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
// JNZ 5(PC)
// Low word was non-zero: out already holds the answer, skip to the final NOP.
// The target is filled in below once the NOP exists.
p1 := s.Prog(x86.AJNE)
p1.To.Type = obj.TYPE_BRANCH
// BSFL arg1, out
// Low word was zero: scan the high word.
p2 := s.Prog(x86.ABSFL)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = v.Args[1].Reg()
p2.To.Type = obj.TYPE_REG
p2.To.Reg = v.Reg()
// JNZ 2(PC)
// High word was non-zero: skip the MOVL and go straight to the ADDL.
p3 := s.Prog(x86.AJNE)
p3.To.Type = obj.TYPE_BRANCH
// MOVL $32, out
// Both words were zero: load 32 so that the ADDL below yields 64.
p4 := s.Prog(x86.AMOVL)
p4.From.Type = obj.TYPE_CONST
p4.From.Offset = 32
p4.To.Type = obj.TYPE_REG
p4.To.Reg = v.Reg()
// ADDL $32, out
// Account for the 32 zero bits of the low word.
p5 := s.Prog(x86.AADDL)
p5.From.Type = obj.TYPE_CONST
p5.From.Offset = 32
p5.To.Type = obj.TYPE_REG
p5.To.Reg = v.Reg()
p3.To.SetTarget(p5)
// NOP (so the JNZ has somewhere to land)
nop := s.Prog(obj.ANOP)
p1.To.SetTarget(nop)