1
0
mirror of https://github.com/golang/go synced 2024-11-12 10:20:27 -07:00
go/test/codegen/arithmetic.go
Ben Shi b444215116 cmd/compile: optimize ARM64's code with MADD/MSUB
MADD does MUL-ADD in a single instruction, and MSUB does the
similar simplification for MUL-SUB.

The CL implements the optimization with MADD/MSUB.

1. The total size of pkg/android_arm64/ decreases about 20KB,
excluding cmd/compile/.

2. The go1 benchmark shows a little improvement for RegexpMatchHard_32-4
and Template-4, excluding noise.

name                     old time/op    new time/op    delta
BinaryTree17-4              16.3s ± 1%     16.5s ± 1%  +1.41%  (p=0.000 n=26+28)
Fannkuch11-4                8.79s ± 1%     8.76s ± 0%  -0.36%  (p=0.000 n=26+28)
FmtFprintfEmpty-4           172ns ± 0%     172ns ± 0%    ~     (all equal)
FmtFprintfString-4          362ns ± 1%     364ns ± 0%  +0.55%  (p=0.000 n=30+30)
FmtFprintfInt-4             416ns ± 0%     416ns ± 0%    ~     (p=0.099 n=22+30)
FmtFprintfIntInt-4          655ns ± 1%     660ns ± 1%  +0.76%  (p=0.000 n=30+30)
FmtFprintfPrefixedInt-4     810ns ± 0%     809ns ± 0%  -0.08%  (p=0.009 n=29+29)
FmtFprintfFloat-4          1.08µs ± 0%    1.09µs ± 0%  +0.61%  (p=0.000 n=30+29)
FmtManyArgs-4              2.70µs ± 0%    2.69µs ± 0%  -0.23%  (p=0.000 n=29+28)
GobDecode-4                32.2ms ± 1%    32.1ms ± 1%  -0.39%  (p=0.000 n=27+26)
GobEncode-4                27.4ms ± 2%    27.4ms ± 1%    ~     (p=0.864 n=28+28)
Gzip-4                      1.53s ± 1%     1.52s ± 1%  -0.30%  (p=0.031 n=29+29)
Gunzip-4                    146ms ± 0%     146ms ± 0%  -0.14%  (p=0.001 n=25+30)
HTTPClientServer-4         1.00ms ± 4%    0.98ms ± 6%  -1.65%  (p=0.001 n=29+30)
JSONEncode-4               67.3ms ± 1%    67.2ms ± 1%    ~     (p=0.520 n=28+28)
JSONDecode-4                329ms ± 5%     330ms ± 4%    ~     (p=0.142 n=30+30)
Mandelbrot200-4            17.3ms ± 0%    17.3ms ± 0%    ~     (p=0.055 n=26+29)
GoParse-4                  16.9ms ± 1%    17.0ms ± 1%  +0.82%  (p=0.000 n=30+30)
RegexpMatchEasy0_32-4       382ns ± 0%     382ns ± 0%    ~     (all equal)
RegexpMatchEasy0_1K-4      1.33µs ± 0%    1.33µs ± 0%  -0.25%  (p=0.000 n=30+27)
RegexpMatchEasy1_32-4       361ns ± 0%     361ns ± 0%  -0.08%  (p=0.002 n=30+28)
RegexpMatchEasy1_1K-4      2.11µs ± 0%    2.09µs ± 0%  -0.54%  (p=0.000 n=30+29)
RegexpMatchMedium_32-4      594ns ± 0%     592ns ± 0%  -0.32%  (p=0.000 n=30+30)
RegexpMatchMedium_1K-4      173µs ± 0%     172µs ± 0%  -0.77%  (p=0.000 n=29+27)
RegexpMatchHard_32-4       10.4µs ± 0%    10.1µs ± 0%  -3.63%  (p=0.000 n=28+27)
RegexpMatchHard_1K-4        306µs ± 0%     301µs ± 0%  -1.64%  (p=0.000 n=29+30)
Revcomp-4                   2.51s ± 1%     2.52s ± 0%  +0.18%  (p=0.017 n=26+27)
Template-4                  394ms ± 3%     382ms ± 3%  -3.22%  (p=0.000 n=28+28)
TimeParse-4                1.67µs ± 0%    1.67µs ± 0%  +0.05%  (p=0.030 n=27+30)
TimeFormat-4               1.72µs ± 0%    1.70µs ± 0%  -0.79%  (p=0.000 n=28+26)
[Geo mean]                  259µs          259µs       -0.33%

name                     old speed      new speed      delta
GobDecode-4              23.8MB/s ± 1%  23.9MB/s ± 1%  +0.40%  (p=0.001 n=27+26)
GobEncode-4              28.0MB/s ± 2%  28.0MB/s ± 1%    ~     (p=0.863 n=28+28)
Gzip-4                   12.7MB/s ± 1%  12.7MB/s ± 1%  +0.32%  (p=0.026 n=29+29)
Gunzip-4                  133MB/s ± 0%   133MB/s ± 0%  +0.15%  (p=0.001 n=24+30)
JSONEncode-4             28.8MB/s ± 1%  28.9MB/s ± 1%    ~     (p=0.475 n=28+28)
JSONDecode-4             5.89MB/s ± 4%  5.87MB/s ± 5%    ~     (p=0.174 n=29+30)
GoParse-4                3.43MB/s ± 0%  3.40MB/s ± 1%  -0.83%  (p=0.000 n=28+30)
RegexpMatchEasy0_32-4    83.6MB/s ± 0%  83.6MB/s ± 0%    ~     (p=0.848 n=28+29)
RegexpMatchEasy0_1K-4     768MB/s ± 0%   770MB/s ± 0%  +0.25%  (p=0.000 n=30+27)
RegexpMatchEasy1_32-4    88.5MB/s ± 0%  88.5MB/s ± 0%    ~     (p=0.086 n=29+29)
RegexpMatchEasy1_1K-4     486MB/s ± 0%   489MB/s ± 0%  +0.54%  (p=0.000 n=30+29)
RegexpMatchMedium_32-4   1.68MB/s ± 0%  1.69MB/s ± 0%  +0.60%  (p=0.000 n=30+23)
RegexpMatchMedium_1K-4   5.90MB/s ± 0%  5.95MB/s ± 0%  +0.85%  (p=0.000 n=18+20)
RegexpMatchHard_32-4     3.07MB/s ± 0%  3.18MB/s ± 0%  +3.72%  (p=0.000 n=29+26)
RegexpMatchHard_1K-4     3.35MB/s ± 0%  3.40MB/s ± 0%  +1.69%  (p=0.000 n=30+30)
Revcomp-4                 101MB/s ± 0%   101MB/s ± 0%  -0.18%  (p=0.018 n=26+27)
Template-4               4.92MB/s ± 4%  5.09MB/s ± 3%  +3.31%  (p=0.000 n=28+28)
[Geo mean]               22.4MB/s       22.6MB/s       +0.62%

Change-Id: I8f304b272785739f57b3c8f736316f658f8c1b2a
Reviewed-on: https://go-review.googlesource.com/129119
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-09-04 20:41:58 +00:00

214 lines
4.4 KiB
Go

// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
// This file contains codegen tests related to arithmetic
// simplifications and optimizations on integer types.
// For codegen tests on float types, see floats.go.
// ----------------- //
//    Subtraction    //
// ----------------- //

// SubMem exercises subtraction where one operand lives in memory. It
// decrements elements of arr in place (by b, by one, and by the constant
// 20) and returns arr[0] - arr[1]. The annotations expect 386 and amd64
// to operate on the slice element's memory location directly.
func SubMem(arr []int, b int) int {
	// 386:`SUBL\s[A-Z]+,\s8\([A-Z]+\)`
	// amd64:`SUBQ\s[A-Z]+,\s16\([A-Z]+\)`
	arr[2] -= b
	// 386:`SUBL\s[A-Z]+,\s12\([A-Z]+\)`
	// amd64:`SUBQ\s[A-Z]+,\s24\([A-Z]+\)`
	arr[3] -= b
	// 386:`DECL\s16\([A-Z]+\)`
	arr[4]--
	// 386:`ADDL\s[$]-20,\s20\([A-Z]+\)`
	arr[5] -= 20
	// 386:"SUBL\t4"
	// amd64:"SUBQ\t8"
	return arr[0] - arr[1]
}
// -------------------- //
//    Multiplication    //
// -------------------- //

// Pow2Muls multiplies n1 by 32 and n2 by -64 and returns both products.
// The annotations expect a left shift (by 5 and by 6 respectively) and
// no multiply instruction on amd64, 386, arm and arm64.
func Pow2Muls(n1, n2 int) (int, int) {
	// amd64:"SHLQ\t[$]5",-"IMULQ"
	// 386:"SHLL\t[$]5",-"IMULL"
	// arm:"SLL\t[$]5",-"MUL"
	// arm64:"LSL\t[$]5",-"MUL"
	a := n1 * 32
	// amd64:"SHLQ\t[$]6",-"IMULQ"
	// 386:"SHLL\t[$]6",-"IMULL"
	// arm:"SLL\t[$]6",-"MUL"
	// arm64:"LSL\t[$]6",-"MUL"
	b := -64 * n2
	return a, b
}
// Mul_96 returns n * 96. On amd64 the annotation expects the product to
// be formed from a left shift by 5 together with a LEAQ that uses a
// scale factor of 2.
func Mul_96(n int) int {
	// amd64:`SHLQ\t[$]5`,`LEAQ\t\(.*\)\(.*\*2\),`
	return n * 96
}
// MulMemSrc multiplies a[0] by a[1] and b[0] by b[1] in place. The
// annotations expect 386 to read the second factor straight from memory
// as the source operand of IMULL, and 386/sse2 to do the same with MULSS.
func MulMemSrc(a []uint32, b []float32) {
	// 386:`IMULL\s4\([A-Z]+\),\s[A-Z]+`
	a[0] *= a[1]
	// 386/sse2:`MULSS\s4\([A-Z]+\),\sX[0-9]+`
	b[0] *= b[1]
}
// Multiplications merging tests

// MergeMuls1 returns 15*n + 31*n. The annotations expect amd64 and 386
// to multiply by the merged constant 46.
func MergeMuls1(n int) int {
	// amd64:"IMUL3Q\t[$]46"
	// 386:"IMUL3L\t[$]46"
	return 15*n + 31*n // 46n
}
// MergeMuls2 returns 5*n + 7*(n+1) + 11*(n+2). The annotations expect
// amd64 and 386 to compute it as a multiply by 23 plus an add of 29.
func MergeMuls2(n int) int {
	// amd64:"IMUL3Q\t[$]23","ADDQ\t[$]29"
	// 386:"IMUL3L\t[$]23","ADDL\t[$]29"
	return 5*n + 7*(n+1) + 11*(n+2) // 23n + 29
}
// MergeMuls3 returns a*n + 19*n. The annotations expect amd64 and 386 to
// add the constant 19 (to form a+19) and not to multiply by the
// constant 19.
func MergeMuls3(a, n int) int {
	// amd64:"ADDQ\t[$]19",-"IMULQ\t[$]19"
	// 386:"ADDL\t[$]19",-"IMULL\t[$]19"
	return a*n + 19*n // (a+19)n
}
// MergeMuls4 returns 23*n - 9*n. The annotations expect amd64 and 386 to
// multiply by the merged constant 14.
func MergeMuls4(n int) int {
	// amd64:"IMUL3Q\t[$]14"
	// 386:"IMUL3L\t[$]14"
	return 23*n - 9*n // 14n
}
// MergeMuls5 returns a*n - 19*n. The annotations expect amd64 and 386 to
// add the constant -19 (to form a-19) and not to multiply by the
// constant 19.
func MergeMuls5(a, n int) int {
	// amd64:"ADDQ\t[$]-19",-"IMULQ\t[$]19"
	// 386:"ADDL\t[$]-19",-"IMULL\t[$]19"
	return a*n - 19*n // (a-19)n
}
// -------------- //
//    Division    //
// -------------- //

// DivMemSrc divides a[0] by a[1] in place. The annotation expects
// 386/sse2 to read the divisor straight from memory as the source
// operand of DIVSD.
func DivMemSrc(a []float64) {
	// 386/sse2:`DIVSD\s8\([A-Z]+\),\sX[0-9]+`
	a[0] /= a[1]
}
// Pow2Divs divides the unsigned n1 by 32 and the signed n2 by 64 and
// returns both quotients. The annotations expect a right shift by 5 for
// the unsigned case and an arithmetic right shift by 6 for the signed
// case, with no divide appearing in the output of any listed
// architecture.
func Pow2Divs(n1 uint, n2 int) (uint, int) {
	// 386:"SHRL\t[$]5",-"DIVL"
	// amd64:"SHRQ\t[$]5",-"DIVQ"
	// arm:"SRL\t[$]5",-".*udiv"
	// arm64:"LSR\t[$]5",-"UDIV"
	a := n1 / 32 // unsigned
	// amd64:"SARQ\t[$]6",-"IDIVQ"
	// 386:"SARL\t[$]6",-"IDIVL"
	// arm:"SRA\t[$]6",-".*udiv"
	// arm64:"ASR\t[$]6",-"SDIV"
	b := n2 / 64 // signed
	return a, b
}
// ConstDivs checks that constant divisions get turned into MULs. It
// divides the unsigned n1 and the signed n2 by 17 and returns both
// quotients. On amd64 the annotations expect a 64-bit constant to be
// loaded and multiplied (MULQ for unsigned, IMULQ for signed), with no
// divide instruction.
func ConstDivs(n1 uint, n2 int) (uint, int) {
	// amd64:"MOVQ\t[$]-1085102592571150095","MULQ",-"DIVQ"
	a := n1 / 17 // unsigned
	// amd64:"MOVQ\t[$]-1085102592571150095","IMULQ",-"IDIVQ"
	b := n2 / 17 // signed
	return a, b
}
// FloatDivs returns a[1] / a[2]. The annotation expects amd64 to read
// the divisor straight from memory as the source operand of DIVSS.
func FloatDivs(a []float32) float32 {
	// amd64:`DIVSS\s8\([A-Z]+\),\sX[0-9]+`
	return a[1] / a[2]
}
// Pow2Mods computes the unsigned n1 modulo 32 and the signed n2 modulo
// 64 and returns both remainders. For the unsigned case the annotations
// expect an AND with 31 and no divide. For the signed case they only
// require that no divide or remainder instruction is emitted.
func Pow2Mods(n1 uint, n2 int) (uint, int) {
	// 386:"ANDL\t[$]31",-"DIVL"
	// amd64:"ANDQ\t[$]31",-"DIVQ"
	// arm:"AND\t[$]31",-".*udiv"
	// arm64:"AND\t[$]31",-"UDIV"
	a := n1 % 32 // unsigned
	// 386:-"IDIVL"
	// amd64:-"IDIVQ"
	// arm:-".*udiv"
	// arm64:-"REM"
	b := n2 % 64 // signed
	return a, b
}
// ConstMods checks that constant modulo divs get turned into MULs. It
// computes the unsigned n1 and the signed n2 modulo 17 and returns both
// remainders. On amd64 the annotations expect a 64-bit constant to be
// loaded and multiplied (MULQ for unsigned, IMULQ for signed), with no
// divide instruction.
func ConstMods(n1 uint, n2 int) (uint, int) {
	// amd64:"MOVQ\t[$]-1085102592571150095","MULQ",-"DIVQ"
	a := n1 % 17 // unsigned
	// amd64:"MOVQ\t[$]-1085102592571150095","IMULQ",-"IDIVQ"
	b := n2 % 17 // signed
	return a, b
}
// Check that len() and cap() calls divided by powers of two are
// optimized into shifts and ands

// LenDiv1 returns len(a) / 1024 for a slice. The annotations expect 386
// and amd64 to use a right shift by 10.
func LenDiv1(a []int) int {
	// 386:"SHRL\t[$]10"
	// amd64:"SHRQ\t[$]10"
	return len(a) / 1024
}
// LenDiv2 returns len(s) divided by the constant expression 4097 >> 1
// (which is 2048) for a string. The annotations expect 386 and amd64 to
// use a right shift by 11.
func LenDiv2(s string) int {
	// 386:"SHRL\t[$]11"
	// amd64:"SHRQ\t[$]11"
	return len(s) / (4097 >> 1)
}
// LenMod1 returns len(a) % 1024 for a slice. The annotations expect 386
// and amd64 to use an AND with 1023.
func LenMod1(a []int) int {
	// 386:"ANDL\t[$]1023"
	// amd64:"ANDQ\t[$]1023"
	return len(a) % 1024
}
// LenMod2 returns len(s) modulo the constant expression 4097 >> 1
// (which is 2048) for a string. The annotations expect 386 and amd64 to
// use an AND with 2047.
func LenMod2(s string) int {
	// 386:"ANDL\t[$]2047"
	// amd64:"ANDQ\t[$]2047"
	return len(s) % (4097 >> 1)
}
// CapDiv returns cap(a) divided by the constant expression
// (1 << 11) + 2048 (which is 4096). The annotations expect 386 and amd64
// to use a right shift by 12.
func CapDiv(a []int) int {
	// 386:"SHRL\t[$]12"
	// amd64:"SHRQ\t[$]12"
	return cap(a) / ((1 << 11) + 2048)
}
// CapMod returns cap(a) modulo the constant expression (1 << 11) + 2048
// (which is 4096). The annotations expect 386 and amd64 to use an AND
// with 4095.
func CapMod(a []int) int {
	// 386:"ANDL\t[$]4095"
	// amd64:"ANDQ\t[$]4095"
	return cap(a) % ((1 << 11) + 2048)
}
// AddMul returns 2*x + 1. The annotation expects amd64 to emit a LEAQ
// whose first operand begins with the displacement 1.
func AddMul(x int) int {
	// amd64:"LEAQ\t1"
	return 2*x + 1
}
// MULA returns a*b + c for 32-bit unsigned operands. The annotations
// expect the multiply and the add to be fused into one instruction, MULA
// on arm and MADDW on arm64.
func MULA(a, b, c uint32) uint32 {
	// arm:`MULA`
	// arm64:`MADDW`
	return a*b + c
}