
cmd/compile: enable carry chain scheduling for arm64

This is a follow up of CL 393656 on arm64.

This CL puts ScoreCarryChainTail before ScoreReadTuple, ScoreVarDef and
ScoreMemory (previously it sat between ScoreMemory and ScoreReadFlags), so
that the scheduling of the carry chain will not break the scheduling of
ScoreVarDef.
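
The chains in question come from multi-word arithmetic such as the
fiat-crypto field code used by crypto/elliptic: on arm64, bits.Add64 is
lowered to ADDS/ADCS, which pass the carry from one op to the next through
the flags register, so the scheduler must keep the chain free of
flag-clobbering instructions. A minimal sketch of such a chain (illustrative
only, not code from this CL; add256 is a made-up helper):

    import "math/bits"

    // add256 adds two 256-bit values held as four 64-bit limbs.
    // Each bits.Add64 becomes ADDS/ADCS on arm64, threading the carry
    // from one limb to the next through the flags register.
    func add256(x, y *[4]uint64) (z [4]uint64, carryOut uint64) {
        var c uint64
        z[0], c = bits.Add64(x[0], y[0], 0)
        z[1], c = bits.Add64(x[1], y[1], c)
        z[2], c = bits.Add64(x[2], y[2], c)
        z[3], c = bits.Add64(x[3], y[3], c)
        return z, c
    }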

Benchmarks:
name                                  old time/op    new time/op    delta
ScalarMult/P256-8                       42.0µs ± 0%    42.0µs ± 0%   -0.13%  (p=0.032 n=5+5)
ScalarMult/P224-8                        135µs ± 0%      96µs ± 0%  -29.04%  (p=0.008 n=5+5)
ScalarMult/P384-8                        573µs ± 1%     355µs ± 0%  -38.05%  (p=0.008 n=5+5)
ScalarMult/P521-8                       1.50ms ± 4%    0.77ms ± 0%  -48.78%  (p=0.008 n=5+5)
MarshalUnmarshal/P256/Uncompressed-8     505ns ± 1%     506ns ± 0%     ~     (p=0.460 n=5+5)
MarshalUnmarshal/P256/Compressed-8      6.75µs ± 0%    6.73µs ± 0%   -0.27%  (p=0.016 n=5+5)
MarshalUnmarshal/P224/Uncompressed-8     927ns ± 0%     818ns ± 0%  -11.76%  (p=0.008 n=5+5)
MarshalUnmarshal/P224/Compressed-8       136µs ± 0%      96µs ± 0%  -29.58%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Uncompressed-8    1.77µs ± 0%    1.36µs ± 1%  -23.14%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Compressed-8      56.5µs ± 0%    31.9µs ± 0%  -43.59%  (p=0.016 n=5+4)
MarshalUnmarshal/P521/Uncompressed-8    2.91µs ± 0%    2.03µs ± 1%  -30.32%  (p=0.008 n=5+5)
MarshalUnmarshal/P521/Compressed-8       148µs ± 0%      68µs ± 1%  -54.28%  (p=0.008 n=5+5)
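
(The table reads like benchstat output; assuming the numbers come from the
crypto/elliptic benchmarks, which the CL does not state explicitly, a
comparable run is go test crypto/elliptic -run '^$' -bench
'ScalarMult|MarshalUnmarshal' -count 5 on the old and new toolchains, with
the two result files compared by benchstat.)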

Change-Id: I4bf4e3265d7e1ee85765ff2bf006ca5a794d4979
Reviewed-on: https://go-review.googlesource.com/c/go/+/432275
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Eric Fang <eric.fang@arm.com>
eric fang 2022-08-18 09:59:35 +00:00 committed by Eric Fang
parent 40c7e94cc5
commit 669ec549b5
2 changed files with 119 additions and 29 deletions

src/cmd/compile/internal/ssa/schedule.go

@@ -14,10 +14,10 @@ const (
ScorePhi = iota // towards top of block
ScoreArg
ScoreNilCheck
+ ScoreCarryChainTail
ScoreReadTuple
ScoreVarDef
ScoreMemory
- ScoreCarryChainTail
ScoreReadFlags
ScoreDefault
ScoreFlags
@@ -155,7 +155,7 @@ func schedule(f *Func) {
// VARDEF ops are scheduled before the corresponding LEA.
score[v.ID] = ScoreMemory
case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN:
- if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].Op.isCarry() || v.Type.IsFlags()) {
+ if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) {
// When the Select pseudo op is being used for a carry or flag from
// a tuple then score it as ScoreFlags so it happens later. This
// prevents the bit from being clobbered before it is used.
@@ -163,8 +163,8 @@ func schedule(f *Func) {
} else {
score[v.ID] = ScoreReadTuple
}
- case v.Op.isCarry():
- if w := v.getCarryProducer(); w != nil {
+ case v.isCarry():
+ if w := v.getCarryInput(); w != nil && w.Block == b {
// The producing op is not the final user of the carry bit. Its
// current score is one of unscored, Flags, or CarryChainTail.
// These occur if the producer has not been scored, another user
@@ -183,7 +183,7 @@ func schedule(f *Func) {
// one chain to be scheduled, if possible.
score[v.ID] = ScoreCarryChainTail
}
- case v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags():
+ case v.isFlagOp():
// Schedule flag register generation as late as possible.
// This makes sure that we only have one live flags
// value at a time.
@@ -192,7 +192,7 @@ func schedule(f *Func) {
score[v.ID] = ScoreDefault
// If we're reading flags, schedule earlier to keep flag lifetime short.
for _, a := range v.Args {
- if a.Type.IsFlags() {
+ if a.isFlagOp() {
score[v.ID] = ScoreReadFlags
}
}
@@ -263,7 +263,6 @@ func schedule(f *Func) {
}
}
}
}
// To put things into a priority queue
@@ -287,7 +286,7 @@ func schedule(f *Func) {
v := heap.Pop(priq).(*Value)
- if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.Op.isCarry() {
+ if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() {
// Add some debugging noise if the chain of carrying ops will not
// likely be scheduled without potential carry flag clobbers.
if !isCarryChainReady(v, uses) {
@@ -551,42 +550,74 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value
return order
}
- // Return whether all dependent carry ops can be scheduled after this.
+ // isFlagOp reports if v is an OP with the flag type.
+ func (v *Value) isFlagOp() bool {
+ return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags()
+ }
+ // isCarryChainReady reports whether all dependent carry ops can be scheduled after this.
func isCarryChainReady(v *Value, uses []int32) bool {
// A chain can be scheduled in it's entirety if
// the use count of each dependent op is 1. If none,
// schedule the first.
j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1.
- for k := v; k != nil; k = k.getCarryProducer() {
+ for k := v; k != nil; k = k.getCarryInput() {
j += int(uses[k.ID]) - 1
}
return j == 0
}
- // Return whether op is an operation which produces a carry bit value, but does not consume it.
- func (op Op) isCarryCreator() bool {
- switch op {
- case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
- return true
- }
- return false
+ // isCarryInput reports whether v accepts a carry value as input.
+ func (v *Value) isCarryInput() bool {
+ return v.getCarryInput() != nil
}
- // Return whether op consumes or creates a carry a bit value.
- func (op Op) isCarry() bool {
- switch op {
- case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
- return true
+ // isCarryOutput reports whether v generates a carry as output.
+ func (v *Value) isCarryOutput() bool {
+ // special cases for PPC64 which put their carry values in XER instead of flags
+ switch v.Block.Func.Config.arch {
+ case "ppc64", "ppc64le":
+ switch v.Op {
+ case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
+ return true
+ }
+ return false
+ }
- return op.isCarryCreator()
+ return v.isFlagOp() && v.Op != OpSelect1
}
- // Return the producing *Value of the carry bit of this op, or nil if none.
- func (v *Value) getCarryProducer() *Value {
- if v.Op.isCarry() && !v.Op.isCarryCreator() {
- // PPC64 carry dependencies are conveyed through their final argument.
- // Likewise, there is always an OpSelect1 between them.
- return v.Args[len(v.Args)-1].Args[0]
+ // isCarryCreator reports whether op is an operation which produces a carry bit value,
+ // but does not consume it.
+ func (v *Value) isCarryCreator() bool {
+ return v.isCarryOutput() && !v.isCarryInput()
+ }
+ // isCarry reports whether op consumes or creates a carry a bit value.
+ func (v *Value) isCarry() bool {
+ return v.isCarryOutput() || v.isCarryInput()
+ }
+ // getCarryInput returns the producing *Value of the carry bit of this op, or nil if none.
+ func (v *Value) getCarryInput() *Value {
+ // special cases for PPC64 which put their carry values in XER instead of flags
+ switch v.Block.Func.Config.arch {
+ case "ppc64", "ppc64le":
+ switch v.Op {
+ case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
+ // PPC64 carry dependencies are conveyed through their final argument.
+ // Likewise, there is always an OpSelect1 between them.
+ return v.Args[len(v.Args)-1].Args[0]
+ }
+ return nil
+ }
+ for _, a := range v.Args {
+ if !a.isFlagOp() {
+ continue
+ }
+ if a.Op == OpSelect1 {
+ a = a.Args[0]
+ }
+ return a
+ }
return nil
}
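
In short, the PPC64-specific opcode lists above are replaced by checks on the
flag type: isCarryOutput treats any op whose result (or second tuple field) is
flag-typed as producing a carry, and getCarryInput follows a flag-typed
argument back through OpSelect1 to its producer. On arm64 a chain therefore
has the shape ADDSflags -> Select1 -> ADCzerocarry (or ADCSflags for longer
chains), which is the shape the new test below builds; PPC64 keeps its special
cases because its carries live in XER rather than in the flags register.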

src/cmd/compile/internal/ssa/schedule_test.go

@@ -99,3 +99,62 @@ func TestStoreOrder(t *testing.T) {
t.Errorf("store order is wrong: got %v, want v2 v3 v4 after v5", order)
}
}
func TestCarryChainOrder(t *testing.T) {
// In the function below, there are two carry chains that have no dependencies on each other,
// one is A1 -> A1carry -> A1Carryvalue, the other is A2 -> A2carry -> A2Carryvalue. If they
// are not scheduled properly, the carry will be clobbered, causing the carry to be regenerated.
c := testConfigARM64(t)
fun := c.Fun("entry",
Bloc("entry",
Valu("mem0", OpInitMem, types.TypeMem, 0, nil),
Valu("x", OpARM64MOVDconst, c.config.Types.UInt64, 5, nil),
Valu("y", OpARM64MOVDconst, c.config.Types.UInt64, 6, nil),
Valu("z", OpARM64MOVDconst, c.config.Types.UInt64, 7, nil),
Valu("A1", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "x", "z"), // x+z, set flags
Valu("A1carry", OpSelect1, types.TypeFlags, 0, nil, "A1"),
Valu("A2", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "y", "z"), // y+z, set flags
Valu("A2carry", OpSelect1, types.TypeFlags, 0, nil, "A2"),
Valu("A1value", OpSelect0, c.config.Types.UInt64, 0, nil, "A1"),
Valu("A1Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A1carry"), // 0+0+A1carry
Valu("A2value", OpSelect0, c.config.Types.UInt64, 0, nil, "A2"),
Valu("A2Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A2carry"), // 0+0+A2carry
Valu("ValueSum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1value", "A2value"),
Valu("CarrySum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1Carryvalue", "A2Carryvalue"),
Valu("Sum", OpARM64AND, c.config.Types.UInt64, 0, nil, "ValueSum", "CarrySum"),
Goto("exit")),
Bloc("exit",
Exit("mem0")),
)
CheckFunc(fun.f)
schedule(fun.f)
// The expected order is A1 < A1carry < A1Carryvalue < A2 < A2carry < A2Carryvalue.
// There is no dependency between the two carry chains, so it doesn't matter which
// comes first and which comes after, but the unsorted position of A1 is before A2,
// so A1Carryvalue < A2.
var ai, bi, ci, di, ei, fi int
for i, v := range fun.f.Blocks[0].Values {
switch {
case fun.values["A1"] == v:
ai = i
case fun.values["A1carry"] == v:
bi = i
case fun.values["A1Carryvalue"] == v:
ci = i
case fun.values["A2"] == v:
di = i
case fun.values["A2carry"] == v:
ei = i
case fun.values["A2Carryvalue"] == v:
fi = i
}
}
if !(ai < bi && bi < ci && ci < di && di < ei && ei < fi) {
t.Logf("Func: %s", fun.f)
t.Errorf("carry chain order is wrong: got %v, want V%d after V%d after V%d after V%d after V%d after V%d,",
fun.f.Blocks[0], fun.values["A1"].ID, fun.values["A1carry"].ID, fun.values["A1Carryvalue"].ID,
fun.values["A2"].ID, fun.values["A2carry"].ID, fun.values["A2Carryvalue"].ID)
}
}
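
The test drives the scheduler directly on a hand-built SSA function; it can be
run on its own with: go test -run TestCarryChainOrder cmd/compile/internal/ssa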