mirror of
https://github.com/golang/go
synced 2024-11-11 19:51:37 -07:00
cmd/compile: optimize multi-register shifts on amd64
amd64 can shift in bits from another register instead of filling with 0/1.
This pattern is helpful when implementing 128-bit shifts or arbitrary-length shifts.
In the standard library, it shows up in pure Go math/big.

Benchmark results on amd64 with -tags=math_big_pure_go.

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.45ns ± 3%  4.39ns ± 1%   -1.28%  (p=0.000 n=30+27)
NonZeroShifts/1/shlVU-8       4.13ns ± 4%  4.10ns ± 2%     ~     (p=0.254 n=29+28)
NonZeroShifts/2/shrVU-8       5.55ns ± 1%  5.63ns ± 2%   +1.42%  (p=0.000 n=28+29)
NonZeroShifts/2/shlVU-8       5.70ns ± 2%  5.14ns ± 1%   -9.82%  (p=0.000 n=29+28)
NonZeroShifts/3/shrVU-8       6.79ns ± 2%  6.35ns ± 2%   -6.46%  (p=0.000 n=28+29)
NonZeroShifts/3/shlVU-8       6.69ns ± 1%  6.25ns ± 1%   -6.60%  (p=0.000 n=28+27)
NonZeroShifts/4/shrVU-8       7.79ns ± 2%  7.06ns ± 2%   -9.48%  (p=0.000 n=30+30)
NonZeroShifts/4/shlVU-8       7.82ns ± 1%  7.24ns ± 1%   -7.37%  (p=0.000 n=28+29)
NonZeroShifts/5/shrVU-8       8.90ns ± 3%  7.93ns ± 1%  -10.84%  (p=0.000 n=29+26)
NonZeroShifts/5/shlVU-8       8.68ns ± 1%  7.92ns ± 1%   -8.76%  (p=0.000 n=29+29)
NonZeroShifts/10/shrVU-8      14.4ns ± 1%  12.3ns ± 2%  -14.79%  (p=0.000 n=28+29)
NonZeroShifts/10/shlVU-8      14.1ns ± 1%  11.9ns ± 2%  -15.55%  (p=0.000 n=28+27)
NonZeroShifts/100/shrVU-8      118ns ± 1%    96ns ± 3%  -18.82%  (p=0.000 n=30+29)
NonZeroShifts/100/shlVU-8      120ns ± 2%    98ns ± 2%  -18.46%  (p=0.000 n=29+28)
NonZeroShifts/1000/shrVU-8    1.10µs ± 1%  0.88µs ± 2%  -19.63%  (p=0.000 n=29+30)
NonZeroShifts/1000/shlVU-8    1.10µs ± 2%  0.88µs ± 2%  -20.28%  (p=0.000 n=29+28)
NonZeroShifts/10000/shrVU-8   10.9µs ± 1%   8.7µs ± 1%  -19.78%  (p=0.000 n=28+27)
NonZeroShifts/10000/shlVU-8   10.9µs ± 2%   8.7µs ± 1%  -19.64%  (p=0.000 n=29+27)
NonZeroShifts/100000/shrVU-8   111µs ± 2%    90µs ± 2%  -19.39%  (p=0.000 n=28+29)
NonZeroShifts/100000/shlVU-8   113µs ± 2%    90µs ± 2%  -20.43%  (p=0.000 n=30+27)

The assembly version is still faster, unfortunately, but the gap is narrowing.
Speedup from pure Go to assembly:

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.39ns ± 1%  3.45ns ± 2%  -21.36%  (p=0.000 n=27+29)
NonZeroShifts/1/shlVU-8       4.10ns ± 2%  3.47ns ± 3%  -15.42%  (p=0.000 n=28+30)
NonZeroShifts/2/shrVU-8       5.63ns ± 2%  3.97ns ± 0%  -29.40%  (p=0.000 n=29+25)
NonZeroShifts/2/shlVU-8       5.14ns ± 1%  3.77ns ± 2%  -26.65%  (p=0.000 n=28+26)
NonZeroShifts/3/shrVU-8       6.35ns ± 2%  4.79ns ± 2%  -24.52%  (p=0.000 n=29+29)
NonZeroShifts/3/shlVU-8       6.25ns ± 1%  4.42ns ± 1%  -29.29%  (p=0.000 n=27+26)
NonZeroShifts/4/shrVU-8       7.06ns ± 2%  5.64ns ± 1%  -20.05%  (p=0.000 n=30+29)
NonZeroShifts/4/shlVU-8       7.24ns ± 1%  5.34ns ± 2%  -26.23%  (p=0.000 n=29+29)
NonZeroShifts/5/shrVU-8       7.93ns ± 1%  6.56ns ± 2%  -17.26%  (p=0.000 n=26+30)
NonZeroShifts/5/shlVU-8       7.92ns ± 1%  6.27ns ± 1%  -20.79%  (p=0.000 n=29+25)
NonZeroShifts/10/shrVU-8      12.3ns ± 2%  10.2ns ± 2%  -17.21%  (p=0.000 n=29+29)
NonZeroShifts/10/shlVU-8      11.9ns ± 2%  10.5ns ± 2%  -12.45%  (p=0.000 n=27+29)
NonZeroShifts/100/shrVU-8     95.9ns ± 3%  77.7ns ± 1%  -19.00%  (p=0.000 n=29+30)
NonZeroShifts/100/shlVU-8     97.5ns ± 2%  66.8ns ± 2%  -31.47%  (p=0.000 n=28+30)
NonZeroShifts/1000/shrVU-8     884ns ± 2%   705ns ± 1%  -20.17%  (p=0.000 n=30+28)
NonZeroShifts/1000/shlVU-8     880ns ± 2%   590ns ± 1%  -32.96%  (p=0.000 n=28+25)
NonZeroShifts/10000/shrVU-8   8.74µs ± 1%  7.34µs ± 3%  -15.94%  (p=0.000 n=27+30)
NonZeroShifts/10000/shlVU-8   8.73µs ± 1%  6.00µs ± 1%  -31.25%  (p=0.000 n=27+28)
NonZeroShifts/100000/shrVU-8  89.6µs ± 2%  75.5µs ± 2%  -15.80%  (p=0.000 n=29+29)
NonZeroShifts/100000/shlVU-8  89.6µs ± 2%  68.0µs ± 3%  -24.09%  (p=0.000 n=27+30)

Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958
Reviewed-on: https://go-review.googlesource.com/c/go/+/297050
Trust: Josh Bleecher Snyder <josharian@gmail.com>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
b0733ba12d
commit
43d5f213e2
@ -253,6 +253,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||
ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
|
||||
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
|
||||
|
||||
case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
|
||||
p.From.Type = obj.TYPE_REG
|
||||
p.From.Reg = bits
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = lo
|
||||
p.SetFrom3Reg(hi)
|
||||
|
||||
case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
|
||||
// Arg[0] (the dividend) is in AX.
|
||||
// Arg[1] (the divisor) can be in any other register.
|
||||
|
@ -902,6 +902,9 @@
|
||||
((SHRB|SARB)const x [0]) => x
|
||||
((ROLQ|ROLL|ROLW|ROLB)const x [0]) => x
|
||||
|
||||
// Multi-register shifts
|
||||
(ORQ (SH(R|L)Q lo bits) (SH(L|R)Q hi (NEGQ bits))) => (SH(R|L)DQ lo hi bits)
|
||||
|
||||
// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
|
||||
// because the x86 instructions are defined to use all 5 bits of the shift even
|
||||
// for the small shifts. I don't think we'll ever generate a weird shift (e.g.
|
||||
|
@ -122,6 +122,7 @@ func init() {
|
||||
gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
|
||||
gp21sb = regInfo{inputs: []regMask{gpspsbg, gpsp}, outputs: gponly}
|
||||
gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
|
||||
gp31shift = regInfo{inputs: []regMask{gp, gp, cx}, outputs: []regMask{gp}}
|
||||
gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
|
||||
gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
|
||||
gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
|
||||
@ -408,6 +409,9 @@ func init() {
|
||||
{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int16(arg0) >> auxint, shift amount 0-15
|
||||
{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int8(arg0) >> auxint, shift amount 0-7
|
||||
|
||||
{name: "SHRDQ", argLength: 3, reg: gp31shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg2, shifting in bits from arg1 (==(arg1<<64+arg0)>>arg2, keeping low 64 bits), shift amount is mod 64
|
||||
{name: "SHLDQ", argLength: 3, reg: gp31shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true}, // unsigned arg0 << arg2, shifting in bits from arg1 (==(arg0<<64+arg1)<<arg2, keeping high 64 bits), shift amount is mod 64
|
||||
|
||||
{name: "ROLQ", argLength: 2, reg: gp21shift, asm: "ROLQ", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
|
||||
{name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
|
||||
{name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true}, // arg0 rotate left arg1 bits.
|
||||
|
@ -732,6 +732,8 @@ const (
|
||||
OpAMD64SARLconst
|
||||
OpAMD64SARWconst
|
||||
OpAMD64SARBconst
|
||||
OpAMD64SHRDQ
|
||||
OpAMD64SHLDQ
|
||||
OpAMD64ROLQ
|
||||
OpAMD64ROLL
|
||||
OpAMD64ROLW
|
||||
@ -9101,6 +9103,40 @@ var opcodeTable = [...]opInfo{
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "SHRDQ",
|
||||
argLen: 3,
|
||||
resultInArg0: true,
|
||||
clobberFlags: true,
|
||||
asm: x86.ASHRQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{2, 2}, // CX
|
||||
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "SHLDQ",
|
||||
argLen: 3,
|
||||
resultInArg0: true,
|
||||
clobberFlags: true,
|
||||
asm: x86.ASHLQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{2, 2}, // CX
|
||||
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "ROLQ",
|
||||
argLen: 2,
|
||||
|
@ -18743,6 +18743,54 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
|
||||
}
|
||||
break
|
||||
}
|
||||
// match: (ORQ (SHRQ lo bits) (SHLQ hi (NEGQ bits)))
|
||||
// result: (SHRDQ lo hi bits)
|
||||
for {
|
||||
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
||||
if v_0.Op != OpAMD64SHRQ {
|
||||
continue
|
||||
}
|
||||
bits := v_0.Args[1]
|
||||
lo := v_0.Args[0]
|
||||
if v_1.Op != OpAMD64SHLQ {
|
||||
continue
|
||||
}
|
||||
_ = v_1.Args[1]
|
||||
hi := v_1.Args[0]
|
||||
v_1_1 := v_1.Args[1]
|
||||
if v_1_1.Op != OpAMD64NEGQ || bits != v_1_1.Args[0] {
|
||||
continue
|
||||
}
|
||||
v.reset(OpAMD64SHRDQ)
|
||||
v.AddArg3(lo, hi, bits)
|
||||
return true
|
||||
}
|
||||
break
|
||||
}
|
||||
// match: (ORQ (SHLQ lo bits) (SHRQ hi (NEGQ bits)))
|
||||
// result: (SHLDQ lo hi bits)
|
||||
for {
|
||||
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
||||
if v_0.Op != OpAMD64SHLQ {
|
||||
continue
|
||||
}
|
||||
bits := v_0.Args[1]
|
||||
lo := v_0.Args[0]
|
||||
if v_1.Op != OpAMD64SHRQ {
|
||||
continue
|
||||
}
|
||||
_ = v_1.Args[1]
|
||||
hi := v_1.Args[0]
|
||||
v_1_1 := v_1.Args[1]
|
||||
if v_1_1.Op != OpAMD64NEGQ || bits != v_1_1.Args[0] {
|
||||
continue
|
||||
}
|
||||
v.reset(OpAMD64SHLDQ)
|
||||
v.AddArg3(lo, hi, bits)
|
||||
return true
|
||||
}
|
||||
break
|
||||
}
|
||||
// match: (ORQ (MOVQconst [c]) (MOVQconst [d]))
|
||||
// result: (MOVQconst [c|d])
|
||||
for {
|
||||
|
@ -288,3 +288,16 @@ func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) {
|
||||
//ppc64: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
|
||||
b[2] = b[v>>25]
|
||||
}
|
||||
|
||||
// 128 bit shifts
|
||||
|
||||
// check128bitShifts exercises the two-word shift idiom: each result ORs a
// shift of x by s with a shift of y in the opposite direction by ŝ.
// The amd64 check comments in the body require that the SHRQ/SHLQ emitted for
// each expression takes three operands.
// For bits%64 != 0, ŝ == 64-s, so the bits vacated in x are filled from y.
// For bits%64 == 0, both s and ŝ are 0, so each result is x|y.
// NOTE(review): a three-operand (double) shift with a zero count presumably
// leaves its destination unchanged, which would differ from x|y; confirm the
// fused form is valid for bits%64 == 0.
func check128bitShifts(x, y uint64, bits uint) (uint64, uint64) {
|
||||
s := bits & 63
|
||||
ŝ := (64 - bits) & 63
|
||||
// check that the shift operation has two commas (three operands)
|
||||
// amd64:"SHRQ.*,.*,"
|
||||
shr := x>>s | y<<ŝ
|
||||
// amd64:"SHLQ.*,.*,"
|
||||
shl := x<<s | y>>ŝ
|
||||
return shr, shl
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user