go/test/codegen/memcombine.go
Ben Shi aaf73c6d1e cmd/compile: optimize ARM64 with shifted register indexed load/store
ARM64 supports efficient instructions that combine a shift, an addition,
and a load/store into a single operation, such as "MOVD (R0)(R1<<3), R2"
and "MOVWU R6, (R4)(R1<<2)".
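
For illustration, the codegen tests below include patterns like the
following (simplified here; the function name is hypothetical), which
are expected to compile to a single shifted-register indexed load such
as "MOVWU (R0)(R1<<2), R2" on ARM64:

    func loadWordIdx(s []byte, idx int) uint32 {
        // With this CL, the byte loads and ORs should merge into one
        // word load whose address uses the shifted index directly.
        return uint32(s[idx<<2]) | uint32(s[(idx<<2)+1])<<8 |
            uint32(s[(idx<<2)+2])<<16 | uint32(s[(idx<<2)+3])<<24
    }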

This CL makes the compiler emit such efficient instructions. Test data
is shown below.

1. binary size before/after
binary                 size change
pkg/linux_arm64        +80.1KB
pkg/tool/linux_arm64   +121.9KB
go                     -4.3KB
gofmt                  -64KB

2. go1 benchmark
There is a big improvement in the Fannkuch11 test case, and slight
improvements in some others, excluding noise.

name                     old time/op    new time/op    delta
BinaryTree17-4              43.9s ± 2%     44.0s ± 2%     ~     (p=0.820 n=30+30)
Fannkuch11-4                30.6s ± 2%     24.5s ± 3%  -19.93%  (p=0.000 n=25+30)
FmtFprintfEmpty-4           500ns ± 0%     499ns ± 0%   -0.11%  (p=0.000 n=23+25)
FmtFprintfString-4         1.03µs ± 0%    1.04µs ± 3%     ~     (p=0.065 n=29+30)
FmtFprintfInt-4            1.15µs ± 3%    1.15µs ± 4%   -0.56%  (p=0.000 n=30+30)
FmtFprintfIntInt-4         1.80µs ± 5%    1.82µs ± 0%     ~     (p=0.094 n=30+24)
FmtFprintfPrefixedInt-4    2.17µs ± 5%    2.20µs ± 0%     ~     (p=0.100 n=30+23)
FmtFprintfFloat-4          3.08µs ± 3%    3.09µs ± 4%     ~     (p=0.123 n=30+30)
FmtManyArgs-4              7.41µs ± 4%    7.17µs ± 1%   -3.26%  (p=0.000 n=30+23)
GobDecode-4                93.7ms ± 0%    94.7ms ± 4%     ~     (p=0.685 n=24+30)
GobEncode-4                78.7ms ± 7%    77.1ms ± 0%     ~     (p=0.729 n=30+23)
Gzip-4                      4.01s ± 0%     3.97s ± 5%   -1.11%  (p=0.037 n=24+30)
Gunzip-4                    389ms ± 4%     384ms ± 0%     ~     (p=0.155 n=30+23)
HTTPClientServer-4          536µs ± 1%     537µs ± 1%     ~     (p=0.236 n=30+30)
JSONEncode-4                179ms ± 1%     182ms ± 6%     ~     (p=0.763 n=24+30)
JSONDecode-4                843ms ± 0%     839ms ± 6%   -0.42%  (p=0.003 n=25+30)
Mandelbrot200-4            46.5ms ± 0%    46.5ms ± 0%   +0.02%  (p=0.000 n=26+26)
GoParse-4                  44.3ms ± 6%    43.3ms ± 0%     ~     (p=0.067 n=30+27)
RegexpMatchEasy0_32-4      1.07µs ± 7%    1.07µs ± 4%     ~     (p=0.835 n=30+30)
RegexpMatchEasy0_1K-4      5.51µs ± 0%    5.49µs ± 0%   -0.35%  (p=0.000 n=23+26)
RegexpMatchEasy1_32-4      1.01µs ± 0%    1.02µs ± 4%   +0.96%  (p=0.014 n=24+30)
RegexpMatchEasy1_1K-4      7.43µs ± 0%    7.18µs ± 0%   -3.41%  (p=0.000 n=23+24)
RegexpMatchMedium_32-4     1.78µs ± 0%    1.81µs ± 4%   +1.47%  (p=0.012 n=23+30)
RegexpMatchMedium_1K-4      547µs ± 1%     542µs ± 3%   -0.90%  (p=0.003 n=24+30)
RegexpMatchHard_32-4       30.4µs ± 0%    29.7µs ± 0%   -2.15%  (p=0.000 n=19+23)
RegexpMatchHard_1K-4        913µs ± 0%     915µs ± 6%   +0.25%  (p=0.012 n=24+30)
Revcomp-4                   6.32s ± 1%     6.42s ± 4%     ~     (p=0.342 n=25+30)
Template-4                  868ms ± 6%     878ms ± 6%   +1.15%  (p=0.000 n=30+30)
TimeParse-4                4.57µs ± 4%    4.59µs ± 3%   +0.65%  (p=0.010 n=29+30)
TimeFormat-4               4.51µs ± 0%    4.50µs ± 0%   -0.27%  (p=0.000 n=27+24)
[Geo mean]                  695µs          689µs        -0.92%

name                     old speed      new speed      delta
GobDecode-4              8.19MB/s ± 0%  8.12MB/s ± 4%     ~     (p=0.680 n=24+30)
GobEncode-4              9.76MB/s ± 7%  9.96MB/s ± 0%     ~     (p=0.616 n=30+23)
Gzip-4                   4.84MB/s ± 0%  4.89MB/s ± 4%   +1.16%  (p=0.030 n=24+30)
Gunzip-4                 49.9MB/s ± 4%  50.6MB/s ± 0%     ~     (p=0.162 n=30+23)
JSONEncode-4             10.9MB/s ± 1%  10.7MB/s ± 6%     ~     (p=0.575 n=24+30)
JSONDecode-4             2.30MB/s ± 0%  2.32MB/s ± 5%   +0.72%  (p=0.003 n=22+30)
GoParse-4                1.31MB/s ± 6%  1.34MB/s ± 0%   +2.26%  (p=0.002 n=30+27)
RegexpMatchEasy0_32-4    30.0MB/s ± 6%  30.0MB/s ± 4%     ~     (p=1.000 n=30+30)
RegexpMatchEasy0_1K-4     186MB/s ± 0%   187MB/s ± 0%   +0.35%  (p=0.000 n=23+26)
RegexpMatchEasy1_32-4    31.8MB/s ± 0%  31.5MB/s ± 4%   -0.92%  (p=0.012 n=25+30)
RegexpMatchEasy1_1K-4     138MB/s ± 0%   143MB/s ± 0%   +3.53%  (p=0.000 n=23+24)
RegexpMatchMedium_32-4    560kB/s ± 0%   553kB/s ± 4%   -1.19%  (p=0.005 n=23+30)
RegexpMatchMedium_1K-4   1.87MB/s ± 0%  1.89MB/s ± 3%   +1.04%  (p=0.002 n=24+30)
RegexpMatchHard_32-4     1.05MB/s ± 0%  1.08MB/s ± 0%   +2.40%  (p=0.000 n=19+23)
RegexpMatchHard_1K-4     1.12MB/s ± 0%  1.12MB/s ± 5%   +0.12%  (p=0.006 n=25+30)
Revcomp-4                40.2MB/s ± 1%  39.6MB/s ± 4%     ~     (p=0.242 n=25+30)
Template-4               2.24MB/s ± 6%  2.21MB/s ± 6%   -1.15%  (p=0.000 n=30+30)
[Geo mean]               7.87MB/s       7.91MB/s        +0.44%

Change-Id: If374cb7abf83537aa0a176f73c0f736f7800db03
Reviewed-on: https://go-review.googlesource.com/108735
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-04-27 20:02:05 +00:00

// asmcheck

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package codegen

import (
	"encoding/binary"
	"runtime"
)

var sink64 uint64
var sink32 uint32
var sink16 uint16
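
// The "arch:`regexp`" comments below are asmcheck directives: the test
// harness compiles this file and checks that the assembly generated for
// the annotated code matches each quoted regexp and does not match any
// regexp prefixed with "-".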

// ------------- //
//    Loading    //
// ------------- //

func load_le64(b []byte) {
	// amd64:`MOVQ\s\(.*\),`
	// s390x:`MOVDBR\s\(.*\),`
	// arm64:`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`
	// ppc64le:`MOVD\s`,-`MOV[BHW]Z`
	sink64 = binary.LittleEndian.Uint64(b)
}

func load_le64_idx(b []byte, idx int) {
	// amd64:`MOVQ\s\(.*\)\(.*\*1\),`
	// s390x:`MOVDBR\s\(.*\)\(.*\*1\),`
	// arm64:`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[BHW]`
	// ppc64le:`MOVD\s`,-`MOV[BHW]Z\s`
	sink64 = binary.LittleEndian.Uint64(b[idx:])
}

func load_le32(b []byte) {
	// amd64:`MOVL\s\(.*\),` 386:`MOVL\s\(.*\),`
	// s390x:`MOVWBR\s\(.*\),`
	// arm64:`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`
	// ppc64le:`MOVWZ\s`
	sink32 = binary.LittleEndian.Uint32(b)
}

func load_le32_idx(b []byte, idx int) {
	// amd64:`MOVL\s\(.*\)\(.*\*1\),` 386:`MOVL\s\(.*\)\(.*\*1\),`
	// s390x:`MOVWBR\s\(.*\)\(.*\*1\),`
	// arm64:`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[BH]`
	// ppc64le:`MOVWZ\s`
	sink32 = binary.LittleEndian.Uint32(b[idx:])
}

func load_le16(b []byte) {
	// amd64:`MOVWLZX\s\(.*\),`
	// ppc64le:`MOVHZ\s`
	// arm64:`MOVHU\s\(R[0-9]+\),`,-`MOVB`
	sink16 = binary.LittleEndian.Uint16(b)
}

func load_le16_idx(b []byte, idx int) {
	// amd64:`MOVWLZX\s\(.*\),`
	// ppc64le:`MOVHZ\s`
	// arm64:`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
	sink16 = binary.LittleEndian.Uint16(b[idx:])
}

func load_be64(b []byte) {
	// amd64:`BSWAPQ`
	// s390x:`MOVD\s\(.*\),`
	// arm64:`REV`,`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`,-`REVW`,-`REV16W`
	sink64 = binary.BigEndian.Uint64(b)
}

func load_be64_idx(b []byte, idx int) {
	// amd64:`BSWAPQ`
	// s390x:`MOVD\s\(.*\)\(.*\*1\),`
	// arm64:`REV`,`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[WHB]`,-`REVW`,-`REV16W`
	sink64 = binary.BigEndian.Uint64(b[idx:])
}

func load_be32(b []byte) {
	// amd64:`BSWAPL`
	// s390x:`MOVWZ\s\(.*\),`
	// arm64:`REVW`,`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`,-`REV16W`
	sink32 = binary.BigEndian.Uint32(b)
}

func load_be32_idx(b []byte, idx int) {
	// amd64:`BSWAPL`
	// s390x:`MOVWZ\s\(.*\)\(.*\*1\),`
	// arm64:`REVW`,`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[HB]`,-`REV16W`
	sink32 = binary.BigEndian.Uint32(b[idx:])
}

func load_be16(b []byte) {
	// amd64:`ROLW\s\$8`
	// arm64: `REV16W`,`MOVHU\s\(R[0-9]+\),`,-`MOVB`
	sink16 = binary.BigEndian.Uint16(b)
}

func load_be16_idx(b []byte, idx int) {
	// amd64:`ROLW\s\$8`
	// arm64: `REV16W`,`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
	sink16 = binary.BigEndian.Uint16(b[idx:])
}

func load_byte2_uint16(s []byte) uint16 {
	// arm64:`MOVHU\t\(R[0-9]+\)`,-`ORR\tR[0-9]+<<8`
	return uint16(s[0]) | uint16(s[1])<<8
}

func load_byte2_uint16_idx(s []byte, idx int) uint16 {
	// arm64:`MOVHU\s\(R[0-9]+\)\(R[0-9]+<<1\)`,-`ORR`,-`MOVB`
	return uint16(s[idx<<1]) | uint16(s[(idx<<1)+1])<<8
}

func load_byte4_uint32_idx(s []byte, idx int) uint32 {
	// arm64:`MOVWU\s\(R[0-9]+\)\(R[0-9]+<<2\)`,-`ORR`,-`MOV[BH]`
	return uint32(s[idx<<2]) | uint32(s[(idx<<2)+1])<<8 | uint32(s[(idx<<2)+2])<<16 | uint32(s[(idx<<2)+3])<<24
}

func load_byte8_uint64_idx(s []byte, idx int) uint64 {
	// arm64:`MOVD\s\(R[0-9]+\)\(R[0-9]+<<3\)`,-`ORR`,-`MOV[BHW]`
	return uint64(s[idx<<3]) | uint64(s[(idx<<3)+1])<<8 | uint64(s[(idx<<3)+2])<<16 | uint64(s[(idx<<3)+3])<<24 | uint64(s[(idx<<3)+4])<<32 | uint64(s[(idx<<3)+5])<<40 | uint64(s[(idx<<3)+6])<<48 | uint64(s[(idx<<3)+7])<<56
}

// Check load combining across function calls.
func fcall_byte(a, b byte) (byte, byte) {
	return fcall_byte(fcall_byte(a, b)) // amd64:`MOVW`
}

func fcall_uint16(a, b uint16) (uint16, uint16) {
	return fcall_uint16(fcall_uint16(a, b)) // amd64:`MOVL`
}

func fcall_uint32(a, b uint32) (uint32, uint32) {
	return fcall_uint32(fcall_uint32(a, b)) // amd64:`MOVQ`
}

// We want to merge load+op in the first function, but not in the
// second. See Issue 19595.
func load_op_merge(p, q *int) {
	x := *p
	*q += x // amd64:`ADDQ\t\(`
}

func load_op_no_merge(p, q *int) {
	x := *p
	for i := 0; i < 10; i++ {
		*q += x // amd64:`ADDQ\t[A-Z]`
	}
}

// Make sure offsets are folded into loads and stores.
func offsets_fold(_, a [20]byte) (b [20]byte) {
	// arm64:`MOVD\t""\.a\+[0-9]+\(FP\), R[0-9]+`,`MOVD\tR[0-9]+, ""\.b\+[0-9]+\(FP\)`
	b = a
	return
}

// Make sure we don't put pointers in SSE registers across safe
// points.
func safe_point(p, q *[2]*int) {
	a, b := p[0], p[1] // amd64:-`MOVUPS`
	runtime.GC()
	q[0], q[1] = a, b // amd64:-`MOVUPS`
}

// ------------- //
//    Storing    //
// ------------- //

func store_le64(b []byte) {
	// amd64:`MOVQ\s.*\(.*\)$`,-`SHR.`
	// arm64:`MOVD`,-`MOV[WBH]`
	// ppc64le:`MOVD\s`,-`MOV[BHW]\s`
	binary.LittleEndian.PutUint64(b, sink64)
}

func store_le64_idx(b []byte, idx int) {
	// amd64:`MOVQ\s.*\(.*\)\(.*\*1\)$`,-`SHR.`
	// arm64:`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`
	// ppc64le:`MOVD\s`,-`MOV[BHW]\s`
	binary.LittleEndian.PutUint64(b[idx:], sink64)
}

func store_le32(b []byte) {
	// amd64:`MOVL\s`
	// arm64:`MOVW`,-`MOV[BH]`
	// ppc64le:`MOVW\s`
	binary.LittleEndian.PutUint32(b, sink32)
}

func store_le32_idx(b []byte, idx int) {
	// amd64:`MOVL\s`
	// arm64:`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`
	// ppc64le:`MOVW\s`
	binary.LittleEndian.PutUint32(b[idx:], sink32)
}

func store_le16(b []byte) {
	// amd64:`MOVW\s`
	// arm64:`MOVH`,-`MOVB`
	// ppc64le(DISABLED):`MOVH\s`
	binary.LittleEndian.PutUint16(b, sink16)
}

func store_le16_idx(b []byte, idx int) {
	// amd64:`MOVW\s`
	// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOVB`
	// ppc64le(DISABLED):`MOVH\s`
	binary.LittleEndian.PutUint16(b[idx:], sink16)
}

func store_be64(b []byte) {
	// amd64:`BSWAPQ`,-`SHR.`
	// arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W`
	binary.BigEndian.PutUint64(b, sink64)
}

func store_be64_idx(b []byte, idx int) {
	// amd64:`BSWAPQ`,-`SHR.`
	// arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW`
	binary.BigEndian.PutUint64(b[idx:], sink64)
}

func store_be32(b []byte) {
	// amd64:`BSWAPL`,-`SHR.`
	// arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W`
	binary.BigEndian.PutUint32(b, sink32)
}

func store_be32_idx(b []byte, idx int) {
	// amd64:`BSWAPL`,-`SHR.`
	// arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W`
	binary.BigEndian.PutUint32(b[idx:], sink32)
}

func store_be16(b []byte) {
	// amd64:`ROLW\s\$8`,-`SHR.`
	// arm64:`MOVH`,`REV16W`,-`MOVB`
	binary.BigEndian.PutUint16(b, sink16)
}

func store_be16_idx(b []byte, idx int) {
	// amd64:`ROLW\s\$8`,-`SHR.`
	// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,`REV16W`,-`MOVB`
	binary.BigEndian.PutUint16(b[idx:], sink16)
}

// ------------- //
//    Zeroing    //
// ------------- //

// Check that zero stores are combined into larger stores
func zero_byte_2(b1, b2 []byte) {
	// bounds checks to guarantee safety of writes below
	_, _ = b1[1], b2[1]
	b1[0], b1[1] = 0, 0 // arm64:"MOVH\tZR",-"MOVB"
	b2[1], b2[0] = 0, 0 // arm64:"MOVH\tZR",-"MOVB"
}

func zero_byte_4(b1, b2 []byte) {
	_, _ = b1[3], b2[3]
	b1[0], b1[1], b1[2], b1[3] = 0, 0, 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
	b2[2], b2[3], b2[1], b2[0] = 0, 0, 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
}

func zero_byte_8(b []byte) {
	_ = b[7]
	b[0], b[1], b[2], b[3] = 0, 0, 0, 0
	b[4], b[5], b[6], b[7] = 0, 0, 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
}

func zero_byte_16(b []byte) {
	_ = b[15]
	b[0], b[1], b[2], b[3] = 0, 0, 0, 0
	b[4], b[5], b[6], b[7] = 0, 0, 0, 0
	b[8], b[9], b[10], b[11] = 0, 0, 0, 0
	b[12], b[13], b[14], b[15] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH",-"MOVW"
}

func zero_byte_2_idx(b []byte, idx int) {
	// arm64: `MOVH\sZR,\s\(R[0-9]+\)\(R[0-9]+<<1\)`,-`MOVB`
	b[(idx<<1)+0] = 0
	b[(idx<<1)+1] = 0
}

func zero_byte_4_idx(b []byte, idx int) {
	// arm64: `MOVW\sZR,\s\(R[0-9]+\)\(R[0-9]+<<2\)`,-`MOV[BH]`
	b[(idx<<2)+0] = 0
	b[(idx<<2)+1] = 0
	b[(idx<<2)+2] = 0
	b[(idx<<2)+3] = 0
}

func zero_byte_8_idx(b []byte, idx int) {
	// arm64: `MOVD\sZR,\s\(R[0-9]+\)\(R[0-9]+<<3\)`,-`MOV[BHW]`
	b[(idx<<3)+0] = 0
	b[(idx<<3)+1] = 0
	b[(idx<<3)+2] = 0
	b[(idx<<3)+3] = 0
	b[(idx<<3)+4] = 0
	b[(idx<<3)+5] = 0
	b[(idx<<3)+6] = 0
	b[(idx<<3)+7] = 0
}

func zero_byte_30(a *[30]byte) {
	*a = [30]byte{} // arm64:"STP",-"MOVB",-"MOVH",-"MOVW"
}

func zero_byte_39(a *[39]byte) {
	*a = [39]byte{} // arm64:"MOVD",-"MOVB",-"MOVH",-"MOVW"
}

func zero_uint16_2(h1, h2 []uint16) {
	_, _ = h1[1], h2[1]
	h1[0], h1[1] = 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
	h2[1], h2[0] = 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
}

func zero_uint16_4(h1, h2 []uint16) {
	_, _ = h1[3], h2[3]
	h1[0], h1[1], h1[2], h1[3] = 0, 0, 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
	h2[2], h2[3], h2[1], h2[0] = 0, 0, 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
}

func zero_uint16_8(h []uint16) {
	_ = h[7]
	h[0], h[1], h[2], h[3] = 0, 0, 0, 0
	h[4], h[5], h[6], h[7] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
}

func zero_uint32_2(w1, w2 []uint32) {
	_, _ = w1[1], w2[1]
	w1[0], w1[1] = 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
	w2[1], w2[0] = 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
}

func zero_uint32_4(w1, w2 []uint32) {
	_, _ = w1[3], w2[3]
	w1[0], w1[1], w1[2], w1[3] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
	w2[2], w2[3], w2[1], w2[0] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
}

func zero_uint64_2(d1, d2 []uint64) {
	_, _ = d1[1], d2[1]
	d1[0], d1[1] = 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
	d2[1], d2[0] = 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
}