2018-03-08 02:57:10 -07:00
|
|
|
// asmcheck
|
|
|
|
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package codegen
|
|
|
|
|
|
|
|
// This file contains codegen tests related to arithmetic
|
2018-03-15 04:15:22 -06:00
|
|
|
// simplifications and optimizations on integer types.
|
|
|
|
// For codegen tests on float types, see floats.go.
|
2018-03-08 02:57:10 -07:00
|
|
|
|
2018-04-15 03:31:39 -06:00
|
|
|
// ----------------- //
|
|
|
|
// Subtraction //
|
|
|
|
// ----------------- //
|
|
|
|
|
2018-10-06 07:13:48 -06:00
|
|
|
// ef is a package-level variable that SubMem subtracts a slice element from.
var ef int

// SubMem performs a series of in-place subtractions and decrements on
// elements of arr (at constant indexes and at the indexes b, c and d) and on
// the package-level variable ef, then returns arr[0] - arr[1].
//
// Each asmcheck comment lists the 386 or amd64 instruction pattern expected
// for the statement that directly follows it.
// arr must hold at least 9 elements and b, c and d must be valid indexes.
func SubMem(arr []int, b, c, d int) int {
	// 386:`SUBL\s[A-Z]+,\s8\([A-Z]+\)`
	// amd64:`SUBQ\s[A-Z]+,\s16\([A-Z]+\)`
	arr[2] -= b
	// 386:`SUBL\s[A-Z]+,\s12\([A-Z]+\)`
	// amd64:`SUBQ\s[A-Z]+,\s24\([A-Z]+\)`
	arr[3] -= b
	// 386:`DECL\s16\([A-Z]+\)`
	arr[4]--
	// 386:`ADDL\s[$]-20,\s20\([A-Z]+\)`
	arr[5] -= 20
	// 386:`SUBL\s\([A-Z]+\)\([A-Z]+\*4\),\s[A-Z]+`
	ef -= arr[b]
	// 386:`SUBL\s[A-Z]+,\s\([A-Z]+\)\([A-Z]+\*4\)`
	arr[c] -= b
	// 386:`ADDL\s[$]-15,\s\([A-Z]+\)\([A-Z]+\*4\)`
	arr[d] -= 15
	// 386:`DECL\s\([A-Z]+\)\([A-Z]+\*4\)`
	arr[b]--
	// amd64:`DECQ\s64\([A-Z]+\)`
	arr[8]--
	// 386:"SUBL\t4"
	// amd64:"SUBQ\t8"
	return arr[0] - arr[1]
}
|
|
|
|
|
2020-08-20 14:06:23 -06:00
|
|
|
// SubFromConst returns 40 - a.
// The asmcheck comments expect a single subtract-from-constant on ppc64 and
// ppc64le.
func SubFromConst(a int) int {
	// ppc64le: `SUBC\tR[0-9]+,\s[$]40,\sR`
	// ppc64: `SUBC\tR[0-9]+,\s[$]40,\sR`
	b := 40 - a
	return b
}
|
|
|
|
|
|
|
|
// SubFromConstNeg returns 40 - (-a), i.e. a + 40.
// The asmcheck comments expect the expression to be emitted as an ADD of the
// constant 40 on ppc64 and ppc64le.
func SubFromConstNeg(a int) int {
	// ppc64le: `ADD\t[$]40,\sR[0-9]+,\sR`
	// ppc64: `ADD\t[$]40,\sR[0-9]+,\sR`
	c := 40 - (-a)
	return c
}
|
|
|
|
|
|
|
|
// SubSubFromConst returns 40 - (20 - a), i.e. a + 20.
// The asmcheck comments expect the two constants to be merged into one ADD
// of 20 on ppc64 and ppc64le.
func SubSubFromConst(a int) int {
	// ppc64le: `ADD\t[$]20,\sR[0-9]+,\sR`
	// ppc64: `ADD\t[$]20,\sR[0-9]+,\sR`
	c := 40 - (20 - a)
	return c
}
|
|
|
|
|
|
|
|
// AddSubFromConst returns 40 + (20 - a), i.e. 60 - a.
// The asmcheck comments expect the two constants to be merged into one
// subtract-from-constant of 60 on ppc64 and ppc64le.
func AddSubFromConst(a int) int {
	// ppc64le: `SUBC\tR[0-9]+,\s[$]60,\sR`
	// ppc64: `SUBC\tR[0-9]+,\s[$]60,\sR`
	c := 40 + (20 - a)
	return c
}
|
|
|
|
|
|
|
|
// NegSubFromConst returns -(20 - a), i.e. a - 20.
// The asmcheck comments expect the expression to be emitted as an ADD of the
// constant -20 on ppc64 and ppc64le.
func NegSubFromConst(a int) int {
	// ppc64le: `ADD\t[$]-20,\sR[0-9]+,\sR`
	// ppc64: `ADD\t[$]-20,\sR[0-9]+,\sR`
	c := -(20 - a)
	return c
}
|
|
|
|
|
|
|
|
// NegAddFromConstNeg returns -(-40 + a), i.e. 40 - a.
// The asmcheck comments expect the expression to be emitted as a single
// subtract-from-constant of 40 on ppc64 and ppc64le.
func NegAddFromConstNeg(a int) int {
	// ppc64le: `SUBC\tR[0-9]+,\s[$]40,\sR`
	// ppc64: `SUBC\tR[0-9]+,\s[$]40,\sR`
	c := -(-40 + a)
	return c
}
|
|
|
|
|
2018-03-14 04:47:34 -06:00
|
|
|
// -------------------- //
|
|
|
|
// Multiplication //
|
|
|
|
// -------------------- //
|
|
|
|
|
2018-03-08 02:57:10 -07:00
|
|
|
// Pow2Muls returns n1*32 and -64*n2.
// The asmcheck comments expect both multiplications by a power of two to be
// emitted as shifts (plus a negation for the negative constant) and forbid a
// multiply instruction on the listed architectures.
func Pow2Muls(n1, n2 int) (int, int) {
	// amd64:"SHLQ\t[$]5",-"IMULQ"
	// 386:"SHLL\t[$]5",-"IMULL"
	// arm:"SLL\t[$]5",-"MUL"
	// arm64:"LSL\t[$]5",-"MUL"
	// ppc64:"SLD\t[$]5",-"MUL"
	// ppc64le:"SLD\t[$]5",-"MUL"
	a := n1 * 32

	// amd64:"SHLQ\t[$]6",-"IMULQ"
	// 386:"SHLL\t[$]6",-"IMULL"
	// arm:"SLL\t[$]6",-"MUL"
	// arm64:`NEG\sR[0-9]+<<6,\sR[0-9]+`,-`LSL`,-`MUL`
	// ppc64:"SLD\t[$]6","NEG\\sR[0-9]+,\\sR[0-9]+",-"MUL"
	// ppc64le:"SLD\t[$]6","NEG\\sR[0-9]+,\\sR[0-9]+",-"MUL"
	b := -64 * n2

	return a, b
}
|
2018-03-09 06:51:30 -07:00
|
|
|
|
2018-04-09 06:24:35 -06:00
|
|
|
// Mul_96 returns n * 96.
// The asmcheck comments expect the multiplication to be built from shifts
// and an add/LEA, with no multiply instruction, on the listed architectures.
func Mul_96(n int) int {
	// amd64:`SHLQ\t[$]5`,`LEAQ\t\(.*\)\(.*\*2\),`,-`IMULQ`
	// 386:`SHLL\t[$]5`,`LEAL\t\(.*\)\(.*\*2\),`,-`IMULL`
	// arm64:`LSL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
	// arm:`SLL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
	// s390x:`SLD\t[$]5`,`SLD\t[$]6`,-`MULLD`
	return n * 96
}
|
|
|
|
|
2020-06-04 11:55:01 -06:00
|
|
|
// Mul_n120 returns n * -120.
// The asmcheck comment expects two shifts and no multiply instruction on
// s390x.
func Mul_n120(n int) int {
	// s390x:`SLD\t[$]3`,`SLD\t[$]7`,-`MULLD`
	return n * -120
}
|
|
|
|
|
2018-06-24 01:04:21 -06:00
|
|
|
// MulMemSrc multiplies the first element of each slice by its second element,
// in place.
// The asmcheck comments expect the second operand to be read directly from
// memory by the multiply instruction on 386 and amd64.
// Both slices must hold at least 2 elements.
func MulMemSrc(a []uint32, b []float32) {
	// 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+`
	a[0] *= a[1]
	// 386/sse2:`MULSS\s4\([A-Z]+\),\sX[0-9]+`
	// amd64:`MULSS\s4\([A-Z]+\),\sX[0-9]+`
	b[0] *= b[1]
}
|
|
|
|
|
2018-03-14 04:47:34 -06:00
|
|
|
// Multiplications merging tests
|
2018-03-09 06:51:30 -07:00
|
|
|
|
|
|
|
// MergeMuls1 returns 15*n + 31*n.
// The asmcheck comments expect the two products to be merged into a single
// multiplication by 46 on amd64 and 386.
func MergeMuls1(n int) int {
	// amd64:"IMUL3Q\t[$]46"
	// 386:"IMUL3L\t[$]46"
	return 15*n + 31*n // 46n
}
|
|
|
|
|
|
|
|
// MergeMuls2 returns 5*n + 7*(n+1) + 11*(n+2).
// The asmcheck comments expect the expression to be reduced to one
// multiplication by 23 plus an addition of 29 on amd64 and 386.
func MergeMuls2(n int) int {
	// amd64:"IMUL3Q\t[$]23","(ADDQ\t[$]29)|(LEAQ\t29)"
	// 386:"IMUL3L\t[$]23","ADDL\t[$]29"
	return 5*n + 7*(n+1) + 11*(n+2) // 23n + 29
}
|
|
|
|
|
|
|
|
// MergeMuls3 returns a*n + 19*n.
// The asmcheck comments expect the common factor n to be pulled out, leaving
// an addition of 19 and no multiplication by the constant 19, on amd64 and
// 386.
func MergeMuls3(a, n int) int {
	// amd64:"ADDQ\t[$]19",-"IMULQ\t[$]19"
	// 386:"ADDL\t[$]19",-"IMULL\t[$]19"
	return a*n + 19*n // (a+19)n
}
|
|
|
|
|
|
|
|
// MergeMuls4 returns 23*n - 9*n.
// The asmcheck comments expect the two products to be merged into a single
// multiplication by 14 on amd64 and 386.
func MergeMuls4(n int) int {
	// amd64:"IMUL3Q\t[$]14"
	// 386:"IMUL3L\t[$]14"
	return 23*n - 9*n // 14n
}
|
|
|
|
|
|
|
|
// MergeMuls5 returns a*n - 19*n.
// The asmcheck comments expect the common factor n to be pulled out, leaving
// an addition of -19 and no multiplication by the constant 19, on amd64 and
// 386.
func MergeMuls5(a, n int) int {
	// amd64:"ADDQ\t[$]-19",-"IMULQ\t[$]19"
	// 386:"ADDL\t[$]-19",-"IMULL\t[$]19"
	return a*n - 19*n // (a-19)n
}
|
2018-03-14 04:47:34 -06:00
|
|
|
|
|
|
|
// -------------- //
|
|
|
|
// Division //
|
|
|
|
// -------------- //
|
|
|
|
|
2018-06-24 01:04:21 -06:00
|
|
|
// DivMemSrc divides a[0] by a[1] in place.
// The asmcheck comments expect the divisor to be read directly from memory
// by the divide instruction on 386 (SSE2) and amd64.
// a must hold at least 2 elements.
func DivMemSrc(a []float64) {
	// 386/sse2:`DIVSD\s8\([A-Z]+\),\sX[0-9]+`
	// amd64:`DIVSD\s8\([A-Z]+\),\sX[0-9]+`
	a[0] /= a[1]
}
|
|
|
|
|
2018-03-14 04:47:34 -06:00
|
|
|
// Pow2Divs returns n1/32 (unsigned) and n2/64 (signed).
// The asmcheck comments expect both divisions by a power of two to be
// emitted with shift instructions and, where listed, forbid a divide
// instruction.
func Pow2Divs(n1 uint, n2 int) (uint, int) {
	// 386:"SHRL\t[$]5",-"DIVL"
	// amd64:"SHRQ\t[$]5",-"DIVQ"
	// arm:"SRL\t[$]5",-".*udiv"
	// arm64:"LSR\t[$]5",-"UDIV"
	// ppc64:"SRD"
	// ppc64le:"SRD"
	a := n1 / 32 // unsigned

	// amd64:"SARQ\t[$]6",-"IDIVQ"
	// 386:"SARL\t[$]6",-"IDIVL"
	// arm:"SRA\t[$]6",-".*udiv"
	// arm64:"ASR\t[$]6",-"SDIV"
	// ppc64:"SRAD"
	// ppc64le:"SRAD"
	b := n2 / 64 // signed

	return a, b
}
|
|
|
|
|
|
|
|
// ConstDivs checks that constant divisions get turned into MULs.
// It returns n1/17 (unsigned) and n2/17 (signed); the asmcheck comments
// expect a multiply and forbid a divide instruction on the listed
// architectures.
func ConstDivs(n1 uint, n2 int) (uint, int) {
	// amd64:"MOVQ\t[$]-1085102592571150095","MULQ",-"DIVQ"
	// 386:"MOVL\t[$]-252645135","MULL",-"DIVL"
	// arm64:`MOVD`,`UMULH`,-`DIV`
	// arm:`MOVW`,`MUL`,-`.*udiv`
	a := n1 / 17 // unsigned

	// amd64:"MOVQ\t[$]-1085102592571150095","IMULQ",-"IDIVQ"
	// 386:"MOVL\t[$]-252645135","IMULL",-"IDIVL"
	// arm64:`MOVD`,`SMULH`,-`DIV`
	// arm:`MOVW`,`MUL`,-`.*udiv`
	b := n2 / 17 // signed

	return a, b
}
|
|
|
|
|
2018-06-21 04:14:18 -06:00
|
|
|
// FloatDivs returns a[1] / a[2].
// The asmcheck comments expect the divisor to be read directly from memory
// by the divide instruction on amd64 and 386 (SSE2).
// a must hold at least 3 elements.
func FloatDivs(a []float32) float32 {
	// amd64:`DIVSS\s8\([A-Z]+\),\sX[0-9]+`
	// 386/sse2:`DIVSS\s8\([A-Z]+\),\sX[0-9]+`
	return a[1] / a[2]
}
|
|
|
|
|
2018-03-14 04:47:34 -06:00
|
|
|
// Pow2Mods returns n1%32 (unsigned) and n2%64 (signed).
// The asmcheck comments expect the unsigned remainder to be emitted as an
// AND with 31 and the signed remainder to use shifts, and, where listed,
// forbid a divide or remainder instruction.
func Pow2Mods(n1 uint, n2 int) (uint, int) {
	// 386:"ANDL\t[$]31",-"DIVL"
	// amd64:"ANDQ\t[$]31",-"DIVQ"
	// arm:"AND\t[$]31",-".*udiv"
	// arm64:"AND\t[$]31",-"UDIV"
	// ppc64:"ANDCC\t[$]31"
	// ppc64le:"ANDCC\t[$]31"
	a := n1 % 32 // unsigned

	// 386:"SHRL",-"IDIVL"
	// amd64:"SHRQ",-"IDIVQ"
	// arm:"SRA",-".*udiv"
	// arm64:"ASR",-"REM"
	// ppc64:"SRAD"
	// ppc64le:"SRAD"
	b := n2 % 64 // signed

	return a, b
}
|
|
|
|
|
2019-04-23 22:04:38 -06:00
|
|
|
// Pow2DivisibleSigned checks that signed divisibility checks get converted
// to AND on low bits.
// It reports whether n1 is divisible by 64 and whether n2 is not divisible
// by 64; the asmcheck comments expect a test of the low six bits and forbid
// divide and shift instructions on the listed architectures.
func Pow2DivisibleSigned(n1, n2 int) (bool, bool) {
	// 386:"TESTL\t[$]63",-"DIVL",-"SHRL"
	// amd64:"TESTQ\t[$]63",-"DIVQ",-"SHRQ"
	// arm:"AND\t[$]63",-".*udiv",-"SRA"
	// arm64:"AND\t[$]63",-"UDIV",-"ASR"
	// ppc64:"ANDCC\t[$]63",-"SRAD"
	// ppc64le:"ANDCC\t[$]63",-"SRAD"
	a := n1%64 == 0 // signed divisible

	// 386:"TESTL\t[$]63",-"DIVL",-"SHRL"
	// amd64:"TESTQ\t[$]63",-"DIVQ",-"SHRQ"
	// arm:"AND\t[$]63",-".*udiv",-"SRA"
	// arm64:"AND\t[$]63",-"UDIV",-"ASR"
	// ppc64:"ANDCC\t[$]63",-"SRAD"
	// ppc64le:"ANDCC\t[$]63",-"SRAD"
	b := n2%64 != 0 // signed indivisible

	return a, b
}
|
|
|
|
|
2018-03-14 04:47:34 -06:00
|
|
|
// ConstMods checks that constant modulo divs get turned into MULs.
// It returns n1%17 (unsigned) and n2%17 (signed); the asmcheck comments
// expect a multiply and forbid a divide instruction on the listed
// architectures.
func ConstMods(n1 uint, n2 int) (uint, int) {
	// amd64:"MOVQ\t[$]-1085102592571150095","MULQ",-"DIVQ"
	// 386:"MOVL\t[$]-252645135","MULL",-"DIVL"
	// arm64:`MOVD`,`UMULH`,-`DIV`
	// arm:`MOVW`,`MUL`,-`.*udiv`
	a := n1 % 17 // unsigned

	// amd64:"MOVQ\t[$]-1085102592571150095","IMULQ",-"IDIVQ"
	// 386:"MOVL\t[$]-252645135","IMULL",-"IDIVL"
	// arm64:`MOVD`,`SMULH`,-`DIV`
	// arm:`MOVW`,`MUL`,-`.*udiv`
	b := n2 % 17 // signed

	return a, b
}
|
2018-03-15 03:06:37 -06:00
|
|
|
|
2019-03-09 21:58:16 -07:00
|
|
|
// Divisible checks that divisibility checks x%c==0 are converted to MULs and
// rotates.
// It reports, in order, whether n1 is divisible by 6, n1 by 19, n2 by 6 and
// n2 by 19. The asmcheck comments expect a multiply (with a rotate for the
// even divisor 6 and none for the odd divisor 19) on the listed
// architectures and, where listed, forbid a divide instruction.
func Divisible(n1 uint, n2 int) (bool, bool, bool, bool) {
	// amd64:"MOVQ\t[$]-6148914691236517205","IMULQ","ROLQ\t[$]63",-"DIVQ"
	// 386:"IMUL3L\t[$]-1431655765","ROLL\t[$]31",-"DIVQ"
	// arm64:"MOVD\t[$]-6148914691236517205","MUL","ROR",-"DIV"
	// arm:"MUL","CMP\t[$]715827882",-".*udiv"
	// ppc64:"MULLD","ROTL\t[$]63"
	// ppc64le:"MULLD","ROTL\t[$]63"
	evenU := n1%6 == 0

	// amd64:"MOVQ\t[$]-8737931403336103397","IMULQ",-"ROLQ",-"DIVQ"
	// 386:"IMUL3L\t[$]678152731",-"ROLL",-"DIVQ"
	// arm64:"MOVD\t[$]-8737931403336103397","MUL",-"ROR",-"DIV"
	// arm:"MUL","CMP\t[$]226050910",-".*udiv"
	// ppc64:"MULLD",-"ROTL"
	// ppc64le:"MULLD",-"ROTL"
	oddU := n1%19 == 0

	// amd64:"IMULQ","ADD","ROLQ\t[$]63",-"DIVQ"
	// 386:"IMUL3L\t[$]-1431655765","ADDL\t[$]715827882","ROLL\t[$]31",-"DIVQ"
	// arm64:"MUL","ADD\t[$]3074457345618258602","ROR",-"DIV"
	// arm:"MUL","ADD\t[$]715827882",-".*udiv"
	// ppc64/power8:"MULLD","ADD","ROTL\t[$]63"
	// ppc64le/power8:"MULLD","ADD","ROTL\t[$]63"
	// ppc64/power9:"MADDLD","ROTL\t[$]63"
	// ppc64le/power9:"MADDLD","ROTL\t[$]63"
	evenS := n2%6 == 0

	// amd64:"IMULQ","ADD",-"ROLQ",-"DIVQ"
	// 386:"IMUL3L\t[$]678152731","ADDL\t[$]113025455",-"ROLL",-"DIVQ"
	// arm64:"MUL","ADD\t[$]485440633518672410",-"ROR",-"DIV"
	// arm:"MUL","ADD\t[$]113025455",-".*udiv"
	// ppc64/power8:"MULLD","ADD",-"ROTL"
	// ppc64/power9:"MADDLD",-"ROTL"
	// ppc64le/power8:"MULLD","ADD",-"ROTL"
	// ppc64le/power9:"MADDLD",-"ROTL"
	oddS := n2%19 == 0

	return evenU, oddU, evenS, oddS
}
|
|
|
|
|
2018-08-06 03:50:38 -06:00
|
|
|
// Check that fix-up code is not generated for divisions where it has been
// proven that the divisor is not -1 or that the dividend is > MinIntNN.

// NoFix64A covers the int64 case in which the divisor is known to be greater
// than 5. When divr > 5 it returns (42/divr + 84%divr, 84%divr); otherwise it
// returns (42, 84) unchanged. The asmcheck comments forbid a JMP for the
// division and remainder statements on amd64.
func NoFix64A(divr int64) (int64, int64) {
	var d int64 = 42
	var e int64 = 84
	if divr > 5 {
		d /= divr // amd64:-"JMP"
		e %= divr // amd64:-"JMP"
		// The following statement is to avoid conflict between the above check
		// and the normal JMP generated at the end of the block.
		d += e
	}
	return d, e
}
|
|
|
|
|
|
|
|
// NoFix64B covers the int64 case in which the divisor is -1 but the dividend
// is known to be greater than the minimum int64 value. When the guard holds
// it returns (divd/-1 + divd%-1, divd%-1); otherwise it returns (0, 0).
// The asmcheck comments forbid a JMP for the division and remainder
// statements on amd64.
func NoFix64B(divd int64) (int64, int64) {
	var d int64
	var e int64
	var divr int64 = -1
	if divd > -9223372036854775808 {
		d = divd / divr // amd64:-"JMP"
		e = divd % divr // amd64:-"JMP"
		// Extra statement after the checked ones; its result is returned in d.
		d += e
	}
	return d, e
}
|
|
|
|
|
|
|
|
// NoFix32A covers the int32 case in which the divisor is known to be greater
// than 5. When divr > 5 it returns (42/divr + 84%divr, 84%divr); otherwise it
// returns (42, 84) unchanged. The asmcheck comments forbid a JMP for the
// division and remainder statements on amd64 and 386.
func NoFix32A(divr int32) (int32, int32) {
	var d int32 = 42
	var e int32 = 84
	if divr > 5 {
		// amd64:-"JMP"
		// 386:-"JMP"
		d /= divr
		// amd64:-"JMP"
		// 386:-"JMP"
		e %= divr
		// Extra statement after the checked ones; its result is returned in d.
		d += e
	}
	return d, e
}
|
|
|
|
|
|
|
|
func NoFix32B(divd int32) (int32, int32) {
|
|
|
|
var d int32
|
|
|
|
var e int32
|
|
|
|
var divr int32 = -1
|
|
|
|
if divd > -2147483648 {
|
|
|
|
// amd64:-"JMP"
|
|
|
|
// 386:-"JMP"
|
|
|
|
d = divd / divr
|
|
|
|
// amd64:-"JMP"
|
|
|
|
// 386:-"JMP"
|
|
|
|
e = divd % divr
|
cmd/compile: use depth first topological sort algorithm for layout
The current layout algorithm tries to put consecutive blocks together,
so the priority of the successor block is higher than the priority of
the zero indegree block. This algorithm is beneficial for subsequent
register allocation, but will result in more branch instructions.
The depth-first topological sorting algorithm is a well-known layout
algorithm, which has applications in many languages, and it helps to
reduce branch instructions. This CL applies it to the layout pass.
The test results show that it helps to reduce the code size.
This CL also includes the following changes:
1, Removed the primary predecessor mechanism. The new layout algorithm is
not very friendly to register allocator in some cases, in order to adapt
to the new layout algorithm, a new primary predecessor selection strategy
is introduced.
2, Since the new layout implementation may place non-loop blocks between
loop blocks, some adaptive modifications have also been made to looprotate
pass.
3, The layout also affects the results of codegen, so this CL also adjusted
several codegen tests accordingly.
It is inevitable that this CL will cause the code size or performance of a
few functions to decrease, but the number of cases it improves is much larger
than the number of cases it drops.
Statistical data from compilecmp on linux/amd64 is as follows:
name old time/op new time/op delta
Template 382ms ± 4% 382ms ± 4% ~ (p=0.497 n=49+50)
Unicode 170ms ± 9% 169ms ± 8% ~ (p=0.344 n=48+50)
GoTypes 2.01s ± 4% 2.01s ± 4% ~ (p=0.628 n=50+48)
Compiler 190ms ±10% 189ms ± 9% ~ (p=0.734 n=50+50)
SSA 11.8s ± 2% 11.8s ± 3% ~ (p=0.877 n=50+50)
Flate 241ms ± 9% 241ms ± 8% ~ (p=0.897 n=50+49)
GoParser 366ms ± 3% 361ms ± 4% -1.21% (p=0.004 n=47+50)
Reflect 835ms ± 3% 838ms ± 3% ~ (p=0.275 n=50+49)
Tar 336ms ± 4% 335ms ± 3% ~ (p=0.454 n=48+48)
XML 433ms ± 4% 431ms ± 3% ~ (p=0.071 n=49+48)
LinkCompiler 706ms ± 4% 705ms ± 4% ~ (p=0.608 n=50+49)
ExternalLinkCompiler 1.85s ± 3% 1.83s ± 2% -1.47% (p=0.000 n=49+48)
LinkWithoutDebugCompiler 437ms ± 5% 437ms ± 6% ~ (p=0.953 n=49+50)
[Geo mean] 615ms 613ms -0.37%
name old alloc/op new alloc/op delta
Template 38.7MB ± 1% 38.7MB ± 1% ~ (p=0.834 n=50+50)
Unicode 28.1MB ± 0% 28.1MB ± 0% -0.22% (p=0.000 n=49+50)
GoTypes 168MB ± 1% 168MB ± 1% ~ (p=0.054 n=47+47)
Compiler 23.0MB ± 1% 23.0MB ± 1% ~ (p=0.432 n=50+50)
SSA 1.54GB ± 0% 1.54GB ± 0% +0.21% (p=0.000 n=50+50)
Flate 23.6MB ± 1% 23.6MB ± 1% ~ (p=0.153 n=43+46)
GoParser 35.1MB ± 1% 35.1MB ± 2% ~ (p=0.202 n=50+50)
Reflect 84.7MB ± 1% 84.7MB ± 1% ~ (p=0.333 n=48+49)
Tar 34.5MB ± 1% 34.5MB ± 1% ~ (p=0.406 n=46+49)
XML 44.3MB ± 2% 44.2MB ± 3% ~ (p=0.981 n=50+50)
LinkCompiler 131MB ± 0% 128MB ± 0% -2.74% (p=0.000 n=50+50)
ExternalLinkCompiler 120MB ± 0% 120MB ± 0% +0.01% (p=0.007 n=50+50)
LinkWithoutDebugCompiler 77.3MB ± 0% 77.3MB ± 0% -0.02% (p=0.000 n=50+50)
[Geo mean] 69.3MB 69.1MB -0.22%
file before after Δ %
addr2line 4104220 4043684 -60536 -1.475%
api 5342502 5249678 -92824 -1.737%
asm 4973785 4858257 -115528 -2.323%
buildid 2667844 2625660 -42184 -1.581%
cgo 4686849 4616313 -70536 -1.505%
compile 23667431 23268406 -399025 -1.686%
cover 4959676 4874108 -85568 -1.725%
dist 3515934 3450422 -65512 -1.863%
doc 3995581 3925469 -70112 -1.755%
fix 3379202 3318522 -60680 -1.796%
link 6743249 6629913 -113336 -1.681%
nm 4047529 3991777 -55752 -1.377%
objdump 4456151 4388151 -68000 -1.526%
pack 2435040 2398072 -36968 -1.518%
pprof 13804080 13565808 -238272 -1.726%
test2json 2690043 2645987 -44056 -1.638%
trace 10418492 10232716 -185776 -1.783%
vet 7258259 7121259 -137000 -1.888%
total 113145867 111204202 -1941665 -1.716%
The situation on linux/arm64 is as follows:
name old time/op new time/op delta
Template 280ms ± 1% 282ms ± 1% +0.75% (p=0.000 n=46+48)
Unicode 124ms ± 2% 124ms ± 2% +0.37% (p=0.045 n=50+50)
GoTypes 1.69s ± 1% 1.70s ± 1% +0.56% (p=0.000 n=49+50)
Compiler 122ms ± 1% 123ms ± 1% +0.93% (p=0.000 n=50+50)
SSA 12.6s ± 1% 12.7s ± 0% +0.72% (p=0.000 n=50+50)
Flate 170ms ± 1% 172ms ± 1% +0.97% (p=0.000 n=49+49)
GoParser 262ms ± 1% 263ms ± 1% +0.39% (p=0.000 n=49+48)
Reflect 639ms ± 1% 650ms ± 1% +1.63% (p=0.000 n=49+49)
Tar 243ms ± 1% 245ms ± 1% +0.82% (p=0.000 n=50+50)
XML 324ms ± 1% 327ms ± 1% +0.72% (p=0.000 n=50+49)
LinkCompiler 597ms ± 1% 596ms ± 1% -0.27% (p=0.001 n=48+47)
ExternalLinkCompiler 1.90s ± 1% 1.88s ± 1% -1.00% (p=0.000 n=50+50)
LinkWithoutDebugCompiler 364ms ± 1% 363ms ± 1% ~ (p=0.220 n=49+50)
[Geo mean] 485ms 488ms +0.49%
name old alloc/op new alloc/op delta
Template 38.7MB ± 0% 38.8MB ± 1% ~ (p=0.093 n=43+49)
Unicode 28.4MB ± 0% 28.4MB ± 0% +0.03% (p=0.000 n=49+45)
GoTypes 169MB ± 1% 169MB ± 1% +0.23% (p=0.010 n=50+50)
Compiler 23.2MB ± 1% 23.2MB ± 1% +0.11% (p=0.000 n=40+44)
SSA 1.54GB ± 0% 1.55GB ± 0% +0.45% (p=0.000 n=47+49)
Flate 23.8MB ± 2% 23.8MB ± 1% ~ (p=0.543 n=50+50)
GoParser 35.3MB ± 1% 35.4MB ± 1% ~ (p=0.792 n=50+50)
Reflect 85.2MB ± 1% 85.2MB ± 0% ~ (p=0.055 n=50+47)
Tar 34.5MB ± 1% 34.5MB ± 1% +0.06% (p=0.015 n=50+50)
XML 43.8MB ± 2% 43.9MB ± 2% +0.19% (p=0.000 n=48+48)
LinkCompiler 137MB ± 0% 136MB ± 0% -0.92% (p=0.000 n=50+50)
ExternalLinkCompiler 127MB ± 0% 127MB ± 0% ~ (p=0.516 n=50+50)
LinkWithoutDebugCompiler 84.0MB ± 0% 84.0MB ± 0% ~ (p=0.057 n=50+50)
[Geo mean] 70.4MB 70.4MB +0.01%
file before after Δ %
addr2line 4021557 4002933 -18624 -0.463%
api 5127847 5028503 -99344 -1.937%
asm 5034716 4936836 -97880 -1.944%
buildid 2608118 2594094 -14024 -0.538%
cgo 4488592 4398320 -90272 -2.011%
compile 22501129 22213592 -287537 -1.278%
cover 4742301 4713573 -28728 -0.606%
dist 3388071 3365311 -22760 -0.672%
doc 3802250 3776082 -26168 -0.688%
fix 3306147 3216939 -89208 -2.698%
link 6404483 6363699 -40784 -0.637%
nm 3941026 3921930 -19096 -0.485%
objdump 4383330 4295122 -88208 -2.012%
pack 2404547 2389515 -15032 -0.625%
pprof 12996234 12856818 -139416 -1.073%
test2json 2668500 2586788 -81712 -3.062%
trace 9816276 9609580 -206696 -2.106%
vet 6900682 6787338 -113344 -1.643%
total 108535806 107056973 -1478833 -1.363%
Change-Id: Iaec1cdcaacca8025e9babb0fb8a532fddb70c87d
Reviewed-on: https://go-review.googlesource.com/c/go/+/255239
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Keith Randall <khr@golang.org>
Trust: eric fang <eric.fang@arm.com>
2020-07-22 20:24:56 -06:00
|
|
|
d += e
|
2018-08-06 03:50:38 -06:00
|
|
|
}
|
|
|
|
return d, e
|
|
|
|
}
|
|
|
|
|
|
|
|
func NoFix16A(divr int16) (int16, int16) {
|
|
|
|
var d int16 = 42
|
|
|
|
var e int16 = 84
|
|
|
|
if divr > 5 {
|
|
|
|
// amd64:-"JMP"
|
|
|
|
// 386:-"JMP"
|
|
|
|
d /= divr
|
|
|
|
// amd64:-"JMP"
|
|
|
|
// 386:-"JMP"
|
|
|
|
e %= divr
|
cmd/compile: use depth first topological sort algorithm for layout
The current layout algorithm tries to put consecutive blocks together,
so the priority of the successor block is higher than the priority of
the zero indegree block. This algorithm is beneficial for subsequent
register allocation, but will result in more branch instructions.
The depth-first topological sorting algorithm is a well-known layout
algorithm, which has applications in many languages, and it helps to
reduce branch instructions. This CL applies it to the layout pass.
The test results show that it helps to reduce the code size.
This CL also includes the following changes:
1, Removed the primary predecessor mechanism. The new layout algorithm is
not very friendly to register allocator in some cases, in order to adapt
to the new layout algorithm, a new primary predecessor selection strategy
is introduced.
2, Since the new layout implementation may place non-loop blocks between
loop blocks, some adaptive modifications have also been made to looprotate
pass.
3, The layout also affects the results of codegen, so this CL also adjusted
several codegen tests accordingly.
It is inevitable that this CL will cause the code size or performance of a
few functions to decrease, but the number of cases it improves is much larger
than the number of cases it drops.
Statistical data from compilecmp on linux/amd64 is as follows:
name old time/op new time/op delta
Template 382ms ± 4% 382ms ± 4% ~ (p=0.497 n=49+50)
Unicode 170ms ± 9% 169ms ± 8% ~ (p=0.344 n=48+50)
GoTypes 2.01s ± 4% 2.01s ± 4% ~ (p=0.628 n=50+48)
Compiler 190ms ±10% 189ms ± 9% ~ (p=0.734 n=50+50)
SSA 11.8s ± 2% 11.8s ± 3% ~ (p=0.877 n=50+50)
Flate 241ms ± 9% 241ms ± 8% ~ (p=0.897 n=50+49)
GoParser 366ms ± 3% 361ms ± 4% -1.21% (p=0.004 n=47+50)
Reflect 835ms ± 3% 838ms ± 3% ~ (p=0.275 n=50+49)
Tar 336ms ± 4% 335ms ± 3% ~ (p=0.454 n=48+48)
XML 433ms ± 4% 431ms ± 3% ~ (p=0.071 n=49+48)
LinkCompiler 706ms ± 4% 705ms ± 4% ~ (p=0.608 n=50+49)
ExternalLinkCompiler 1.85s ± 3% 1.83s ± 2% -1.47% (p=0.000 n=49+48)
LinkWithoutDebugCompiler 437ms ± 5% 437ms ± 6% ~ (p=0.953 n=49+50)
[Geo mean] 615ms 613ms -0.37%
name old alloc/op new alloc/op delta
Template 38.7MB ± 1% 38.7MB ± 1% ~ (p=0.834 n=50+50)
Unicode 28.1MB ± 0% 28.1MB ± 0% -0.22% (p=0.000 n=49+50)
GoTypes 168MB ± 1% 168MB ± 1% ~ (p=0.054 n=47+47)
Compiler 23.0MB ± 1% 23.0MB ± 1% ~ (p=0.432 n=50+50)
SSA 1.54GB ± 0% 1.54GB ± 0% +0.21% (p=0.000 n=50+50)
Flate 23.6MB ± 1% 23.6MB ± 1% ~ (p=0.153 n=43+46)
GoParser 35.1MB ± 1% 35.1MB ± 2% ~ (p=0.202 n=50+50)
Reflect 84.7MB ± 1% 84.7MB ± 1% ~ (p=0.333 n=48+49)
Tar 34.5MB ± 1% 34.5MB ± 1% ~ (p=0.406 n=46+49)
XML 44.3MB ± 2% 44.2MB ± 3% ~ (p=0.981 n=50+50)
LinkCompiler 131MB ± 0% 128MB ± 0% -2.74% (p=0.000 n=50+50)
ExternalLinkCompiler 120MB ± 0% 120MB ± 0% +0.01% (p=0.007 n=50+50)
LinkWithoutDebugCompiler 77.3MB ± 0% 77.3MB ± 0% -0.02% (p=0.000 n=50+50)
[Geo mean] 69.3MB 69.1MB -0.22%
file before after Δ %
addr2line 4104220 4043684 -60536 -1.475%
api 5342502 5249678 -92824 -1.737%
asm 4973785 4858257 -115528 -2.323%
buildid 2667844 2625660 -42184 -1.581%
cgo 4686849 4616313 -70536 -1.505%
compile 23667431 23268406 -399025 -1.686%
cover 4959676 4874108 -85568 -1.725%
dist 3515934 3450422 -65512 -1.863%
doc 3995581 3925469 -70112 -1.755%
fix 3379202 3318522 -60680 -1.796%
link 6743249 6629913 -113336 -1.681%
nm 4047529 3991777 -55752 -1.377%
objdump 4456151 4388151 -68000 -1.526%
pack 2435040 2398072 -36968 -1.518%
pprof 13804080 13565808 -238272 -1.726%
test2json 2690043 2645987 -44056 -1.638%
trace 10418492 10232716 -185776 -1.783%
vet 7258259 7121259 -137000 -1.888%
total 113145867 111204202 -1941665 -1.716%
The situation on linux/arm64 is as follows:
name old time/op new time/op delta
Template 280ms ± 1% 282ms ± 1% +0.75% (p=0.000 n=46+48)
Unicode 124ms ± 2% 124ms ± 2% +0.37% (p=0.045 n=50+50)
GoTypes 1.69s ± 1% 1.70s ± 1% +0.56% (p=0.000 n=49+50)
Compiler 122ms ± 1% 123ms ± 1% +0.93% (p=0.000 n=50+50)
SSA 12.6s ± 1% 12.7s ± 0% +0.72% (p=0.000 n=50+50)
Flate 170ms ± 1% 172ms ± 1% +0.97% (p=0.000 n=49+49)
GoParser 262ms ± 1% 263ms ± 1% +0.39% (p=0.000 n=49+48)
Reflect 639ms ± 1% 650ms ± 1% +1.63% (p=0.000 n=49+49)
Tar 243ms ± 1% 245ms ± 1% +0.82% (p=0.000 n=50+50)
XML 324ms ± 1% 327ms ± 1% +0.72% (p=0.000 n=50+49)
LinkCompiler 597ms ± 1% 596ms ± 1% -0.27% (p=0.001 n=48+47)
ExternalLinkCompiler 1.90s ± 1% 1.88s ± 1% -1.00% (p=0.000 n=50+50)
LinkWithoutDebugCompiler 364ms ± 1% 363ms ± 1% ~ (p=0.220 n=49+50)
[Geo mean] 485ms 488ms +0.49%
name old alloc/op new alloc/op delta
Template 38.7MB ± 0% 38.8MB ± 1% ~ (p=0.093 n=43+49)
Unicode 28.4MB ± 0% 28.4MB ± 0% +0.03% (p=0.000 n=49+45)
GoTypes 169MB ± 1% 169MB ± 1% +0.23% (p=0.010 n=50+50)
Compiler 23.2MB ± 1% 23.2MB ± 1% +0.11% (p=0.000 n=40+44)
SSA 1.54GB ± 0% 1.55GB ± 0% +0.45% (p=0.000 n=47+49)
Flate 23.8MB ± 2% 23.8MB ± 1% ~ (p=0.543 n=50+50)
GoParser 35.3MB ± 1% 35.4MB ± 1% ~ (p=0.792 n=50+50)
Reflect 85.2MB ± 1% 85.2MB ± 0% ~ (p=0.055 n=50+47)
Tar 34.5MB ± 1% 34.5MB ± 1% +0.06% (p=0.015 n=50+50)
XML 43.8MB ± 2% 43.9MB ± 2% +0.19% (p=0.000 n=48+48)
LinkCompiler 137MB ± 0% 136MB ± 0% -0.92% (p=0.000 n=50+50)
ExternalLinkCompiler 127MB ± 0% 127MB ± 0% ~ (p=0.516 n=50+50)
LinkWithoutDebugCompiler 84.0MB ± 0% 84.0MB ± 0% ~ (p=0.057 n=50+50)
[Geo mean] 70.4MB 70.4MB +0.01%
file before after Δ %
addr2line 4021557 4002933 -18624 -0.463%
api 5127847 5028503 -99344 -1.937%
asm 5034716 4936836 -97880 -1.944%
buildid 2608118 2594094 -14024 -0.538%
cgo 4488592 4398320 -90272 -2.011%
compile 22501129 22213592 -287537 -1.278%
cover 4742301 4713573 -28728 -0.606%
dist 3388071 3365311 -22760 -0.672%
doc 3802250 3776082 -26168 -0.688%
fix 3306147 3216939 -89208 -2.698%
link 6404483 6363699 -40784 -0.637%
nm 3941026 3921930 -19096 -0.485%
objdump 4383330 4295122 -88208 -2.012%
pack 2404547 2389515 -15032 -0.625%
pprof 12996234 12856818 -139416 -1.073%
test2json 2668500 2586788 -81712 -3.062%
trace 9816276 9609580 -206696 -2.106%
vet 6900682 6787338 -113344 -1.643%
total 108535806 107056973 -1478833 -1.363%
Change-Id: Iaec1cdcaacca8025e9babb0fb8a532fddb70c87d
Reviewed-on: https://go-review.googlesource.com/c/go/+/255239
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Keith Randall <khr@golang.org>
Trust: eric fang <eric.fang@arm.com>
2020-07-22 20:24:56 -06:00
|
|
|
d += e
|
2018-08-06 03:50:38 -06:00
|
|
|
}
|
|
|
|
return d, e
|
|
|
|
}
|
|
|
|
|
|
|
|
func NoFix16B(divd int16) (int16, int16) {
|
|
|
|
var d int16
|
|
|
|
var e int16
|
|
|
|
var divr int16 = -1
|
|
|
|
if divd > -32768 {
|
|
|
|
// amd64:-"JMP"
|
|
|
|
// 386:-"JMP"
|
|
|
|
d = divd / divr
|
|
|
|
// amd64:-"JMP"
|
|
|
|
// 386:-"JMP"
|
|
|
|
e = divd % divr
|
cmd/compile: use depth first topological sort algorithm for layout
The current layout algorithm tries to put consecutive blocks together,
so the priority of the successor block is higher than the priority of
the zero indegree block. This algorithm is beneficial for subsequent
register allocation, but will result in more branch instructions.
The depth-first topological sorting algorithm is a well-known layout
algorithm, which has applications in many languages, and it helps to
reduce branch instructions. This CL applies it to the layout pass.
The test results show that it helps to reduce the code size.
This CL also includes the following changes:
1, Removed the primary predecessor mechanism. The new layout algorithm is
not very friendly to register allocator in some cases, in order to adapt
to the new layout algorithm, a new primary predecessor selection strategy
is introduced.
2, Since the new layout implementation may place non-loop blocks between
loop blocks, some adaptive modifications have also been made to looprotate
pass.
3, The layout also affects the results of codegen, so this CL also adjusted
several codegen tests accordingly.
It is inevitable that this CL will cause the code size or performance of a
few functions to decrease, but the number of cases it improves is much larger
than the number of cases it drops.
Statistical data from compilecmp on linux/amd64 is as follows:
name old time/op new time/op delta
Template 382ms ± 4% 382ms ± 4% ~ (p=0.497 n=49+50)
Unicode 170ms ± 9% 169ms ± 8% ~ (p=0.344 n=48+50)
GoTypes 2.01s ± 4% 2.01s ± 4% ~ (p=0.628 n=50+48)
Compiler 190ms ±10% 189ms ± 9% ~ (p=0.734 n=50+50)
SSA 11.8s ± 2% 11.8s ± 3% ~ (p=0.877 n=50+50)
Flate 241ms ± 9% 241ms ± 8% ~ (p=0.897 n=50+49)
GoParser 366ms ± 3% 361ms ± 4% -1.21% (p=0.004 n=47+50)
Reflect 835ms ± 3% 838ms ± 3% ~ (p=0.275 n=50+49)
Tar 336ms ± 4% 335ms ± 3% ~ (p=0.454 n=48+48)
XML 433ms ± 4% 431ms ± 3% ~ (p=0.071 n=49+48)
LinkCompiler 706ms ± 4% 705ms ± 4% ~ (p=0.608 n=50+49)
ExternalLinkCompiler 1.85s ± 3% 1.83s ± 2% -1.47% (p=0.000 n=49+48)
LinkWithoutDebugCompiler 437ms ± 5% 437ms ± 6% ~ (p=0.953 n=49+50)
[Geo mean] 615ms 613ms -0.37%
name old alloc/op new alloc/op delta
Template 38.7MB ± 1% 38.7MB ± 1% ~ (p=0.834 n=50+50)
Unicode 28.1MB ± 0% 28.1MB ± 0% -0.22% (p=0.000 n=49+50)
GoTypes 168MB ± 1% 168MB ± 1% ~ (p=0.054 n=47+47)
Compiler 23.0MB ± 1% 23.0MB ± 1% ~ (p=0.432 n=50+50)
SSA 1.54GB ± 0% 1.54GB ± 0% +0.21% (p=0.000 n=50+50)
Flate 23.6MB ± 1% 23.6MB ± 1% ~ (p=0.153 n=43+46)
GoParser 35.1MB ± 1% 35.1MB ± 2% ~ (p=0.202 n=50+50)
Reflect 84.7MB ± 1% 84.7MB ± 1% ~ (p=0.333 n=48+49)
Tar 34.5MB ± 1% 34.5MB ± 1% ~ (p=0.406 n=46+49)
XML 44.3MB ± 2% 44.2MB ± 3% ~ (p=0.981 n=50+50)
LinkCompiler 131MB ± 0% 128MB ± 0% -2.74% (p=0.000 n=50+50)
ExternalLinkCompiler 120MB ± 0% 120MB ± 0% +0.01% (p=0.007 n=50+50)
LinkWithoutDebugCompiler 77.3MB ± 0% 77.3MB ± 0% -0.02% (p=0.000 n=50+50)
[Geo mean] 69.3MB 69.1MB -0.22%
file before after Δ %
addr2line 4104220 4043684 -60536 -1.475%
api 5342502 5249678 -92824 -1.737%
asm 4973785 4858257 -115528 -2.323%
buildid 2667844 2625660 -42184 -1.581%
cgo 4686849 4616313 -70536 -1.505%
compile 23667431 23268406 -399025 -1.686%
cover 4959676 4874108 -85568 -1.725%
dist 3515934 3450422 -65512 -1.863%
doc 3995581 3925469 -70112 -1.755%
fix 3379202 3318522 -60680 -1.796%
link 6743249 6629913 -113336 -1.681%
nm 4047529 3991777 -55752 -1.377%
objdump 4456151 4388151 -68000 -1.526%
pack 2435040 2398072 -36968 -1.518%
pprof 13804080 13565808 -238272 -1.726%
test2json 2690043 2645987 -44056 -1.638%
trace 10418492 10232716 -185776 -1.783%
vet 7258259 7121259 -137000 -1.888%
total 113145867 111204202 -1941665 -1.716%
The situation on linux/arm64 is as follows:
name old time/op new time/op delta
Template 280ms ± 1% 282ms ± 1% +0.75% (p=0.000 n=46+48)
Unicode 124ms ± 2% 124ms ± 2% +0.37% (p=0.045 n=50+50)
GoTypes 1.69s ± 1% 1.70s ± 1% +0.56% (p=0.000 n=49+50)
Compiler 122ms ± 1% 123ms ± 1% +0.93% (p=0.000 n=50+50)
SSA 12.6s ± 1% 12.7s ± 0% +0.72% (p=0.000 n=50+50)
Flate 170ms ± 1% 172ms ± 1% +0.97% (p=0.000 n=49+49)
GoParser 262ms ± 1% 263ms ± 1% +0.39% (p=0.000 n=49+48)
Reflect 639ms ± 1% 650ms ± 1% +1.63% (p=0.000 n=49+49)
Tar 243ms ± 1% 245ms ± 1% +0.82% (p=0.000 n=50+50)
XML 324ms ± 1% 327ms ± 1% +0.72% (p=0.000 n=50+49)
LinkCompiler 597ms ± 1% 596ms ± 1% -0.27% (p=0.001 n=48+47)
ExternalLinkCompiler 1.90s ± 1% 1.88s ± 1% -1.00% (p=0.000 n=50+50)
LinkWithoutDebugCompiler 364ms ± 1% 363ms ± 1% ~ (p=0.220 n=49+50)
[Geo mean] 485ms 488ms +0.49%
name old alloc/op new alloc/op delta
Template 38.7MB ± 0% 38.8MB ± 1% ~ (p=0.093 n=43+49)
Unicode 28.4MB ± 0% 28.4MB ± 0% +0.03% (p=0.000 n=49+45)
GoTypes 169MB ± 1% 169MB ± 1% +0.23% (p=0.010 n=50+50)
Compiler 23.2MB ± 1% 23.2MB ± 1% +0.11% (p=0.000 n=40+44)
SSA 1.54GB ± 0% 1.55GB ± 0% +0.45% (p=0.000 n=47+49)
Flate 23.8MB ± 2% 23.8MB ± 1% ~ (p=0.543 n=50+50)
GoParser 35.3MB ± 1% 35.4MB ± 1% ~ (p=0.792 n=50+50)
Reflect 85.2MB ± 1% 85.2MB ± 0% ~ (p=0.055 n=50+47)
Tar 34.5MB ± 1% 34.5MB ± 1% +0.06% (p=0.015 n=50+50)
XML 43.8MB ± 2% 43.9MB ± 2% +0.19% (p=0.000 n=48+48)
LinkCompiler 137MB ± 0% 136MB ± 0% -0.92% (p=0.000 n=50+50)
ExternalLinkCompiler 127MB ± 0% 127MB ± 0% ~ (p=0.516 n=50+50)
LinkWithoutDebugCompiler 84.0MB ± 0% 84.0MB ± 0% ~ (p=0.057 n=50+50)
[Geo mean] 70.4MB 70.4MB +0.01%
file before after Δ %
addr2line 4021557 4002933 -18624 -0.463%
api 5127847 5028503 -99344 -1.937%
asm 5034716 4936836 -97880 -1.944%
buildid 2608118 2594094 -14024 -0.538%
cgo 4488592 4398320 -90272 -2.011%
compile 22501129 22213592 -287537 -1.278%
cover 4742301 4713573 -28728 -0.606%
dist 3388071 3365311 -22760 -0.672%
doc 3802250 3776082 -26168 -0.688%
fix 3306147 3216939 -89208 -2.698%
link 6404483 6363699 -40784 -0.637%
nm 3941026 3921930 -19096 -0.485%
objdump 4383330 4295122 -88208 -2.012%
pack 2404547 2389515 -15032 -0.625%
pprof 12996234 12856818 -139416 -1.073%
test2json 2668500 2586788 -81712 -3.062%
trace 9816276 9609580 -206696 -2.106%
vet 6900682 6787338 -113344 -1.643%
total 108535806 107056973 -1478833 -1.363%
Change-Id: Iaec1cdcaacca8025e9babb0fb8a532fddb70c87d
Reviewed-on: https://go-review.googlesource.com/c/go/+/255239
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Keith Randall <khr@golang.org>
Trust: eric fang <eric.fang@arm.com>
2020-07-22 20:24:56 -06:00
|
|
|
d += e
|
2018-08-06 03:50:38 -06:00
|
|
|
}
|
|
|
|
return d, e
|
|
|
|
}
|
|
|
|
|
2018-03-15 03:06:37 -06:00
|
|
|
// Check that len() and cap() calls divided by powers of two are
|
|
|
|
// optimized into shifts and ands
|
|
|
|
|
|
|
|
func LenDiv1(a []int) int {
|
|
|
|
// 386:"SHRL\t[$]10"
|
|
|
|
// amd64:"SHRQ\t[$]10"
|
2018-11-30 02:30:36 -07:00
|
|
|
// arm64:"LSR\t[$]10",-"SDIV"
|
|
|
|
// arm:"SRL\t[$]10",-".*udiv"
|
2018-10-15 10:53:07 -06:00
|
|
|
// ppc64:"SRD\t[$]10"
|
|
|
|
// ppc64le:"SRD\t[$]10"
|
2018-03-15 03:06:37 -06:00
|
|
|
return len(a) / 1024
|
|
|
|
}
|
|
|
|
|
|
|
|
func LenDiv2(s string) int {
|
|
|
|
// 386:"SHRL\t[$]11"
|
|
|
|
// amd64:"SHRQ\t[$]11"
|
2018-11-30 02:30:36 -07:00
|
|
|
// arm64:"LSR\t[$]11",-"SDIV"
|
|
|
|
// arm:"SRL\t[$]11",-".*udiv"
|
2018-10-15 10:53:07 -06:00
|
|
|
// ppc64:"SRD\t[$]11"
|
|
|
|
// ppc64le:"SRD\t[$]11"
|
2018-03-15 03:06:37 -06:00
|
|
|
return len(s) / (4097 >> 1)
|
|
|
|
}
|
|
|
|
|
|
|
|
func LenMod1(a []int) int {
|
|
|
|
// 386:"ANDL\t[$]1023"
|
|
|
|
// amd64:"ANDQ\t[$]1023"
|
2018-11-30 02:30:36 -07:00
|
|
|
// arm64:"AND\t[$]1023",-"SDIV"
|
|
|
|
// arm/6:"AND",-".*udiv"
|
|
|
|
// arm/7:"BFC",-".*udiv",-"AND"
|
2018-10-15 10:53:07 -06:00
|
|
|
// ppc64:"ANDCC\t[$]1023"
|
|
|
|
// ppc64le:"ANDCC\t[$]1023"
|
2018-03-15 03:06:37 -06:00
|
|
|
return len(a) % 1024
|
|
|
|
}
|
|
|
|
|
|
|
|
func LenMod2(s string) int {
|
|
|
|
// 386:"ANDL\t[$]2047"
|
|
|
|
// amd64:"ANDQ\t[$]2047"
|
2018-11-30 02:30:36 -07:00
|
|
|
// arm64:"AND\t[$]2047",-"SDIV"
|
|
|
|
// arm/6:"AND",-".*udiv"
|
|
|
|
// arm/7:"BFC",-".*udiv",-"AND"
|
2018-10-15 10:53:07 -06:00
|
|
|
// ppc64:"ANDCC\t[$]2047"
|
|
|
|
// ppc64le:"ANDCC\t[$]2047"
|
2018-03-15 03:06:37 -06:00
|
|
|
return len(s) % (4097 >> 1)
|
|
|
|
}
|
|
|
|
|
|
|
|
func CapDiv(a []int) int {
|
|
|
|
// 386:"SHRL\t[$]12"
|
|
|
|
// amd64:"SHRQ\t[$]12"
|
2018-11-30 02:30:36 -07:00
|
|
|
// arm64:"LSR\t[$]12",-"SDIV"
|
|
|
|
// arm:"SRL\t[$]12",-".*udiv"
|
2018-10-15 10:53:07 -06:00
|
|
|
// ppc64:"SRD\t[$]12"
|
|
|
|
// ppc64le:"SRD\t[$]12"
|
2018-03-15 03:06:37 -06:00
|
|
|
return cap(a) / ((1 << 11) + 2048)
|
|
|
|
}
|
|
|
|
|
|
|
|
func CapMod(a []int) int {
|
|
|
|
// 386:"ANDL\t[$]4095"
|
|
|
|
// amd64:"ANDQ\t[$]4095"
|
2018-11-30 02:30:36 -07:00
|
|
|
// arm64:"AND\t[$]4095",-"SDIV"
|
|
|
|
// arm/6:"AND",-".*udiv"
|
|
|
|
// arm/7:"BFC",-".*udiv",-"AND"
|
2018-10-15 10:53:07 -06:00
|
|
|
// ppc64:"ANDCC\t[$]4095"
|
|
|
|
// ppc64le:"ANDCC\t[$]4095"
|
2018-03-15 03:06:37 -06:00
|
|
|
return cap(a) % ((1 << 11) + 2048)
|
|
|
|
}
|
2018-04-23 14:49:51 -06:00
|
|
|
|
|
|
|
func AddMul(x int) int {
|
|
|
|
// amd64:"LEAQ\t1"
|
|
|
|
return 2*x + 1
|
|
|
|
}
|
2018-08-13 04:38:25 -06:00
|
|
|
|
2018-09-27 07:21:03 -06:00
|
|
|
func MULA(a, b, c uint32) (uint32, uint32, uint32) {
|
|
|
|
// arm:`MULA`,-`MUL\s`
|
|
|
|
// arm64:`MADDW`,-`MULW`
|
|
|
|
r0 := a*b + c
|
2018-10-29 03:01:14 -06:00
|
|
|
// arm:`MULA`,-`MUL\s`
|
2018-09-27 07:21:03 -06:00
|
|
|
// arm64:`MADDW`,-`MULW`
|
|
|
|
r1 := c*79 + a
|
2018-10-29 03:01:14 -06:00
|
|
|
// arm:`ADD`,-`MULA`,-`MUL\s`
|
2018-09-27 07:21:03 -06:00
|
|
|
// arm64:`ADD`,-`MADD`,-`MULW`
|
|
|
|
r2 := b*64 + c
|
|
|
|
return r0, r1, r2
|
2018-08-13 04:38:25 -06:00
|
|
|
}
|
2018-10-14 20:25:10 -06:00
|
|
|
|
|
|
|
func MULS(a, b, c uint32) (uint32, uint32, uint32) {
|
|
|
|
// arm/7:`MULS`,-`MUL\s`
|
2018-10-29 03:01:14 -06:00
|
|
|
// arm/6:`SUB`,`MUL\s`,-`MULS`
|
2018-10-14 20:25:10 -06:00
|
|
|
// arm64:`MSUBW`,-`MULW`
|
|
|
|
r0 := c - a*b
|
2018-10-29 03:01:14 -06:00
|
|
|
// arm/7:`MULS`,-`MUL\s`
|
|
|
|
// arm/6:`SUB`,`MUL\s`,-`MULS`
|
2018-10-14 20:25:10 -06:00
|
|
|
// arm64:`MSUBW`,-`MULW`
|
|
|
|
r1 := a - c*79
|
2018-10-29 03:01:14 -06:00
|
|
|
// arm/7:`SUB`,-`MULS`,-`MUL\s`
|
2018-10-14 20:25:10 -06:00
|
|
|
// arm64:`SUB`,-`MSUBW`,-`MULW`
|
|
|
|
r2 := c - b*64
|
|
|
|
return r0, r1, r2
|
|
|
|
}
|
2019-03-19 13:26:22 -06:00
|
|
|
|
|
|
|
func addSpecial(a, b, c uint32) (uint32, uint32, uint32) {
|
|
|
|
// amd64:`INCL`
|
|
|
|
a++
|
|
|
|
// amd64:`DECL`
|
|
|
|
b--
|
|
|
|
// amd64:`SUBL.*-128`
|
|
|
|
c += 128
|
|
|
|
return a, b, c
|
|
|
|
}
|
2020-05-07 14:44:51 -06:00
|
|
|
|
|
|
|
// Divide -> shift rules usually require fixup for negative inputs.
|
|
|
|
// If the input is non-negative, make sure the fixup is eliminated.
|
|
|
|
func divInt(v int64) int64 {
|
|
|
|
if v < 0 {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
// amd64:-`.*SARQ.*63,`, -".*SHRQ", ".*SARQ.*[$]9,"
|
|
|
|
return v / 512
|
|
|
|
}
|
cmd/compile: add more generic rewrite rules to reassociate (op (op y C) x|C)
With this patch, opt pass can expose more obvious constant-folding
opportunities.
Example:
func test(i int) int {return (i+8)-(i+4)}
The previous version:
MOVD "".i(FP), R0
ADD $8, R0, R1
ADD $4, R0, R0
SUB R0, R1, R0
MOVD R0, "".~r1+8(FP)
RET (R30)
The optimized version:
MOVD $4, R0
MOVD R0, "".~r1+8(FP)
RET (R30)
This patch removes some existing reassociation rules, such as "x+(z-C)",
because the current generic rewrite rules will canonicalize "x-const"
to "x+(-const)", making "x+(z-C)" equal to "x+(z+(-C))".
This patch also adds test cases.
Change-Id: I857108ba0b5fcc18a879eeab38e2551bc4277797
Reviewed-on: https://go-review.googlesource.com/c/go/+/237137
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-05-28 04:11:52 -06:00
|
|
|
|
|
|
|
// The reassociate rules "x - (z + C) -> (x - z) - C" and
|
|
|
|
// "(z + C) - x -> C + (z - x)" can optimize the following cases.
|
|
|
|
func constantFold1(i0, j0, i1, j1, i2, j2, i3, j3 int) (int, int, int, int) {
|
|
|
|
// arm64:"SUB","ADD\t[$]2"
|
|
|
|
r0 := (i0 + 3) - (j0 + 1)
|
|
|
|
// arm64:"SUB","SUB\t[$]4"
|
|
|
|
r1 := (i1 - 3) - (j1 + 1)
|
|
|
|
// arm64:"SUB","ADD\t[$]4"
|
|
|
|
r2 := (i2 + 3) - (j2 - 1)
|
|
|
|
// arm64:"SUB","SUB\t[$]2"
|
|
|
|
r3 := (i3 - 3) - (j3 - 1)
|
|
|
|
return r0, r1, r2, r3
|
|
|
|
}
|
|
|
|
|
|
|
|
// The reassociate rules "x - (z + C) -> (x - z) - C" and
|
|
|
|
// "(C - z) - x -> C - (z + x)" can optimize the following cases.
|
|
|
|
func constantFold2(i0, j0, i1, j1 int) (int, int) {
|
|
|
|
// arm64:"ADD","MOVD\t[$]2","SUB"
|
|
|
|
r0 := (3 - i0) - (j0 + 1)
|
|
|
|
// arm64:"ADD","MOVD\t[$]4","SUB"
|
|
|
|
r1 := (3 - i1) - (j1 - 1)
|
|
|
|
return r0, r1
|
|
|
|
}
|
|
|
|
|
|
|
|
func constantFold3(i, j int) int {
|
|
|
|
// arm64: "MOVD\t[$]30","MUL",-"ADD",-"LSL"
|
|
|
|
r := (5 * i) * (6 * j)
|
|
|
|
return r
|
|
|
|
}
|