From 669ec549b554ef7fe957bf284271ed3b7db82da1 Mon Sep 17 00:00:00 2001 From: eric fang Date: Thu, 18 Aug 2022 09:59:35 +0000 Subject: [PATCH] cmd/compile: enable carry chain scheduling for arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a follow up of CL 393656 on arm64. This CL puts ScoreCarryChainTail before ScoreMemory and after ScoreReadFlags, so that the scheduling of the carry chain will not break the scheduling of ScoreVarDef. Benchmarks: name old time/op new time/op delta ScalarMult/P256-8 42.0µs ± 0% 42.0µs ± 0% -0.13% (p=0.032 n=5+5) ScalarMult/P224-8 135µs ± 0% 96µs ± 0% -29.04% (p=0.008 n=5+5) ScalarMult/P384-8 573µs ± 1% 355µs ± 0% -38.05% (p=0.008 n=5+5) ScalarMult/P521-8 1.50ms ± 4% 0.77ms ± 0% -48.78% (p=0.008 n=5+5) MarshalUnmarshal/P256/Uncompressed-8 505ns ± 1% 506ns ± 0% ~ (p=0.460 n=5+5) MarshalUnmarshal/P256/Compressed-8 6.75µs ± 0% 6.73µs ± 0% -0.27% (p=0.016 n=5+5) MarshalUnmarshal/P224/Uncompressed-8 927ns ± 0% 818ns ± 0% -11.76% (p=0.008 n=5+5) MarshalUnmarshal/P224/Compressed-8 136µs ± 0% 96µs ± 0% -29.58% (p=0.008 n=5+5) MarshalUnmarshal/P384/Uncompressed-8 1.77µs ± 0% 1.36µs ± 1% -23.14% (p=0.008 n=5+5) MarshalUnmarshal/P384/Compressed-8 56.5µs ± 0% 31.9µs ± 0% -43.59% (p=0.016 n=5+4) MarshalUnmarshal/P521/Uncompressed-8 2.91µs ± 0% 2.03µs ± 1% -30.32% (p=0.008 n=5+5) MarshalUnmarshal/P521/Compressed-8 148µs ± 0% 68µs ± 1% -54.28% (p=0.008 n=5+5) Change-Id: I4bf4e3265d7e1ee85765ff2bf006ca5a794d4979 Reviewed-on: https://go-review.googlesource.com/c/go/+/432275 Reviewed-by: Carlos Amedee Reviewed-by: Keith Randall TryBot-Result: Gopher Robot Reviewed-by: Keith Randall Run-TryBot: Eric Fang --- src/cmd/compile/internal/ssa/schedule.go | 89 +++++++++++++------ src/cmd/compile/internal/ssa/schedule_test.go | 59 ++++++++++++ 2 files changed, 119 insertions(+), 29 deletions(-) diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go index ebf84d59b3..62eaa2ed45 100644 --- a/src/cmd/compile/internal/ssa/schedule.go +++ b/src/cmd/compile/internal/ssa/schedule.go @@ -14,10 +14,10 @@ const ( ScorePhi = iota // towards top of block ScoreArg ScoreNilCheck - ScoreCarryChainTail ScoreReadTuple ScoreVarDef ScoreMemory + ScoreCarryChainTail ScoreReadFlags ScoreDefault ScoreFlags @@ -155,7 +155,7 @@ func schedule(f *Func) { // VARDEF ops are scheduled before the corresponding LEA. score[v.ID] = ScoreMemory case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN: - if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].Op.isCarry() || v.Type.IsFlags()) { + if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) { // When the Select pseudo op is being used for a carry or flag from // a tuple then score it as ScoreFlags so it happens later. This // prevents the bit from being clobbered before it is used. @@ -163,8 +163,8 @@ func schedule(f *Func) { } else { score[v.ID] = ScoreReadTuple } - case v.Op.isCarry(): - if w := v.getCarryProducer(); w != nil { + case v.isCarry(): + if w := v.getCarryInput(); w != nil && w.Block == b { // The producing op is not the final user of the carry bit. Its // current score is one of unscored, Flags, or CarryChainTail. // These occur if the producer has not been scored, another user @@ -183,7 +183,7 @@ func schedule(f *Func) { // one chain to be scheduled, if possible. score[v.ID] = ScoreCarryChainTail } - case v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags(): + case v.isFlagOp(): // Schedule flag register generation as late as possible. // This makes sure that we only have one live flags // value at a time. @@ -192,7 +192,7 @@ func schedule(f *Func) { score[v.ID] = ScoreDefault // If we're reading flags, schedule earlier to keep flag lifetime short. for _, a := range v.Args { - if a.Type.IsFlags() { + if a.isFlagOp() { score[v.ID] = ScoreReadFlags } } @@ -263,7 +263,6 @@ func schedule(f *Func) { } } } - } // To put things into a priority queue @@ -287,7 +286,7 @@ func schedule(f *Func) { v := heap.Pop(priq).(*Value) - if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.Op.isCarry() { + if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() { // Add some debugging noise if the chain of carrying ops will not // likely be scheduled without potential carry flag clobbers. if !isCarryChainReady(v, uses) { @@ -551,42 +550,74 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value return order } -// Return whether all dependent carry ops can be scheduled after this. +// isFlagOp reports if v is an OP with the flag type. +func (v *Value) isFlagOp() bool { + return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags() +} + +// isCarryChainReady reports whether all dependent carry ops can be scheduled after this. func isCarryChainReady(v *Value, uses []int32) bool { // A chain can be scheduled in it's entirety if // the use count of each dependent op is 1. If none, // schedule the first. j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1. - for k := v; k != nil; k = k.getCarryProducer() { + for k := v; k != nil; k = k.getCarryInput() { j += int(uses[k.ID]) - 1 } return j == 0 } -// Return whether op is an operation which produces a carry bit value, but does not consume it. -func (op Op) isCarryCreator() bool { - switch op { - case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst: - return true - } - return false +// isCarryInput reports whether v accepts a carry value as input. +func (v *Value) isCarryInput() bool { + return v.getCarryInput() != nil } -// Return whether op consumes or creates a carry a bit value. -func (op Op) isCarry() bool { - switch op { - case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero: - return true +// isCarryOutput reports whether v generates a carry as output. +func (v *Value) isCarryOutput() bool { + // special cases for PPC64 which put their carry values in XER instead of flags + switch v.Block.Func.Config.arch { + case "ppc64", "ppc64le": + switch v.Op { + case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst: + return true + } + return false } - return op.isCarryCreator() + return v.isFlagOp() && v.Op != OpSelect1 } -// Return the producing *Value of the carry bit of this op, or nil if none. -func (v *Value) getCarryProducer() *Value { - if v.Op.isCarry() && !v.Op.isCarryCreator() { - // PPC64 carry dependencies are conveyed through their final argument. - // Likewise, there is always an OpSelect1 between them. - return v.Args[len(v.Args)-1].Args[0] +// isCarryCreator reports whether op is an operation which produces a carry bit value, +// but does not consume it. +func (v *Value) isCarryCreator() bool { + return v.isCarryOutput() && !v.isCarryInput() +} + +// isCarry reports whether op consumes or creates a carry a bit value. +func (v *Value) isCarry() bool { + return v.isCarryOutput() || v.isCarryInput() +} + +// getCarryInput returns the producing *Value of the carry bit of this op, or nil if none. +func (v *Value) getCarryInput() *Value { + // special cases for PPC64 which put their carry values in XER instead of flags + switch v.Block.Func.Config.arch { + case "ppc64", "ppc64le": + switch v.Op { + case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero: + // PPC64 carry dependencies are conveyed through their final argument. + // Likewise, there is always an OpSelect1 between them. + return v.Args[len(v.Args)-1].Args[0] + } + return nil + } + for _, a := range v.Args { + if !a.isFlagOp() { + continue + } + if a.Op == OpSelect1 { + a = a.Args[0] + } + return a } return nil } diff --git a/src/cmd/compile/internal/ssa/schedule_test.go b/src/cmd/compile/internal/ssa/schedule_test.go index f7177dd704..6cf5105be1 100644 --- a/src/cmd/compile/internal/ssa/schedule_test.go +++ b/src/cmd/compile/internal/ssa/schedule_test.go @@ -99,3 +99,62 @@ func TestStoreOrder(t *testing.T) { t.Errorf("store order is wrong: got %v, want v2 v3 v4 after v5", order) } } + +func TestCarryChainOrder(t *testing.T) { + // In the function below, there are two carry chains that have no dependencies on each other, + // one is A1 -> A1carry -> A1Carryvalue, the other is A2 -> A2carry -> A2Carryvalue. If they + // are not scheduled properly, the carry will be clobbered, causing the carry to be regenerated. + c := testConfigARM64(t) + fun := c.Fun("entry", + Bloc("entry", + Valu("mem0", OpInitMem, types.TypeMem, 0, nil), + Valu("x", OpARM64MOVDconst, c.config.Types.UInt64, 5, nil), + Valu("y", OpARM64MOVDconst, c.config.Types.UInt64, 6, nil), + Valu("z", OpARM64MOVDconst, c.config.Types.UInt64, 7, nil), + Valu("A1", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "x", "z"), // x+z, set flags + Valu("A1carry", OpSelect1, types.TypeFlags, 0, nil, "A1"), + Valu("A2", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "y", "z"), // y+z, set flags + Valu("A2carry", OpSelect1, types.TypeFlags, 0, nil, "A2"), + Valu("A1value", OpSelect0, c.config.Types.UInt64, 0, nil, "A1"), + Valu("A1Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A1carry"), // 0+0+A1carry + Valu("A2value", OpSelect0, c.config.Types.UInt64, 0, nil, "A2"), + Valu("A2Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A2carry"), // 0+0+A2carry + Valu("ValueSum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1value", "A2value"), + Valu("CarrySum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1Carryvalue", "A2Carryvalue"), + Valu("Sum", OpARM64AND, c.config.Types.UInt64, 0, nil, "ValueSum", "CarrySum"), + Goto("exit")), + Bloc("exit", + Exit("mem0")), + ) + + CheckFunc(fun.f) + schedule(fun.f) + + // The expected order is A1 < A1carry < A1Carryvalue < A2 < A2carry < A2Carryvalue. + // There is no dependency between the two carry chains, so it doesn't matter which + // comes first and which comes after, but the unsorted position of A1 is before A2, + // so A1Carryvalue < A2. + var ai, bi, ci, di, ei, fi int + for i, v := range fun.f.Blocks[0].Values { + switch { + case fun.values["A1"] == v: + ai = i + case fun.values["A1carry"] == v: + bi = i + case fun.values["A1Carryvalue"] == v: + ci = i + case fun.values["A2"] == v: + di = i + case fun.values["A2carry"] == v: + ei = i + case fun.values["A2Carryvalue"] == v: + fi = i + } + } + if !(ai < bi && bi < ci && ci < di && di < ei && ei < fi) { + t.Logf("Func: %s", fun.f) + t.Errorf("carry chain order is wrong: got %v, want V%d after V%d after V%d after V%d after V%d after V%d,", + fun.f.Blocks[0], fun.values["A1"].ID, fun.values["A1carry"].ID, fun.values["A1Carryvalue"].ID, + fun.values["A2"].ID, fun.values["A2carry"].ID, fun.values["A2Carryvalue"].ID) + } +}