2018-02-26 17:59:58 -07:00
|
|
|
// asmcheck
|
|
|
|
|
2018-03-02 13:06:09 -07:00
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
2018-02-26 17:59:58 -07:00
|
|
|
package codegen
|
|
|
|
|
2021-09-19 10:23:37 -06:00
|
|
|
import "math/bits"
|
|
|
|
|
cmd/compile: add patterns for bit set/clear/complement on amd64
This patch completes implementation of BT(Q|L), and adds support
for BT(S|R|C)(Q|L).
Example of code changes from time.(*Time).addSec:
if t.wall&hasMonotonic != 0 {
0x1073465 488b08 MOVQ 0(AX), CX
0x1073468 4889ca MOVQ CX, DX
0x107346b 48c1e93f SHRQ $0x3f, CX
0x107346f 48c1e13f SHLQ $0x3f, CX
0x1073473 48f7c1ffffffff TESTQ $-0x1, CX
0x107347a 746b JE 0x10734e7
if t.wall&hasMonotonic != 0 {
0x1073435 488b08 MOVQ 0(AX), CX
0x1073438 480fbae13f BTQ $0x3f, CX
0x107343d 7363 JAE 0x10734a2
Another example:
t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic
0x10734c8 4881e1ffffff3f ANDQ $0x3fffffff, CX
0x10734cf 48c1e61e SHLQ $0x1e, SI
0x10734d3 4809ce ORQ CX, SI
0x10734d6 48b90000000000000080 MOVQ $0x8000000000000000, CX
0x10734e0 4809f1 ORQ SI, CX
0x10734e3 488908 MOVQ CX, 0(AX)
t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic
0x107348b 4881e2ffffff3f ANDQ $0x3fffffff, DX
0x1073492 48c1e61e SHLQ $0x1e, SI
0x1073496 4809f2 ORQ SI, DX
0x1073499 480fbaea3f BTSQ $0x3f, DX
0x107349e 488910 MOVQ DX, 0(AX)
Go1 benchmarks seem unaffected, and I would be surprised
otherwise:
name old time/op new time/op delta
BinaryTree17-4 2.64s ± 4% 2.56s ± 9% -2.92% (p=0.008 n=9+9)
Fannkuch11-4 2.90s ± 1% 2.95s ± 3% +1.76% (p=0.010 n=10+9)
FmtFprintfEmpty-4 35.3ns ± 1% 34.5ns ± 2% -2.34% (p=0.004 n=9+8)
FmtFprintfString-4 57.0ns ± 1% 58.4ns ± 5% +2.52% (p=0.029 n=9+10)
FmtFprintfInt-4 59.8ns ± 3% 59.8ns ± 6% ~ (p=0.565 n=10+10)
FmtFprintfIntInt-4 93.9ns ± 3% 91.2ns ± 5% -2.94% (p=0.014 n=10+9)
FmtFprintfPrefixedInt-4 107ns ± 6% 104ns ± 6% ~ (p=0.099 n=10+10)
FmtFprintfFloat-4 187ns ± 3% 188ns ± 3% ~ (p=0.505 n=10+9)
FmtManyArgs-4 410ns ± 1% 415ns ± 6% ~ (p=0.649 n=8+10)
GobDecode-4 5.30ms ± 3% 5.27ms ± 3% ~ (p=0.436 n=10+10)
GobEncode-4 4.62ms ± 5% 4.47ms ± 2% -3.24% (p=0.001 n=9+10)
Gzip-4 197ms ± 4% 193ms ± 3% ~ (p=0.123 n=10+10)
Gunzip-4 30.4ms ± 3% 30.1ms ± 3% ~ (p=0.481 n=10+10)
HTTPClientServer-4 76.3µs ± 1% 76.0µs ± 1% ~ (p=0.236 n=8+9)
JSONEncode-4 10.5ms ± 9% 10.3ms ± 3% ~ (p=0.280 n=10+10)
JSONDecode-4 42.3ms ±10% 41.3ms ± 2% ~ (p=0.053 n=9+10)
Mandelbrot200-4 3.80ms ± 2% 3.72ms ± 2% -2.15% (p=0.001 n=9+10)
GoParse-4 2.88ms ±10% 2.81ms ± 2% ~ (p=0.247 n=10+10)
RegexpMatchEasy0_32-4 69.5ns ± 4% 68.6ns ± 2% ~ (p=0.171 n=10+10)
RegexpMatchEasy0_1K-4 165ns ± 3% 162ns ± 3% ~ (p=0.137 n=10+10)
RegexpMatchEasy1_32-4 65.7ns ± 6% 64.4ns ± 2% -2.02% (p=0.037 n=10+10)
RegexpMatchEasy1_1K-4 278ns ± 2% 279ns ± 3% ~ (p=0.991 n=8+9)
RegexpMatchMedium_32-4 99.3ns ± 3% 98.5ns ± 4% ~ (p=0.457 n=10+9)
RegexpMatchMedium_1K-4 30.1µs ± 1% 30.4µs ± 2% ~ (p=0.173 n=8+10)
RegexpMatchHard_32-4 1.40µs ± 2% 1.41µs ± 4% ~ (p=0.565 n=10+10)
RegexpMatchHard_1K-4 42.5µs ± 1% 41.5µs ± 3% -2.13% (p=0.002 n=8+9)
Revcomp-4 332ms ± 4% 328ms ± 5% ~ (p=0.720 n=9+10)
Template-4 48.3ms ± 2% 49.6ms ± 3% +2.56% (p=0.002 n=8+10)
TimeParse-4 252ns ± 2% 249ns ± 3% ~ (p=0.116 n=9+10)
TimeFormat-4 262ns ± 4% 252ns ± 3% -4.01% (p=0.000 n=9+10)
name old speed new speed delta
GobDecode-4 145MB/s ± 3% 146MB/s ± 3% ~ (p=0.436 n=10+10)
GobEncode-4 166MB/s ± 5% 172MB/s ± 2% +3.28% (p=0.001 n=9+10)
Gzip-4 98.6MB/s ± 4% 100.4MB/s ± 3% ~ (p=0.123 n=10+10)
Gunzip-4 639MB/s ± 3% 645MB/s ± 3% ~ (p=0.481 n=10+10)
JSONEncode-4 185MB/s ± 8% 189MB/s ± 3% ~ (p=0.280 n=10+10)
JSONDecode-4 46.0MB/s ± 9% 47.0MB/s ± 2% +2.21% (p=0.046 n=9+10)
GoParse-4 20.1MB/s ± 9% 20.6MB/s ± 2% ~ (p=0.239 n=10+10)
RegexpMatchEasy0_32-4 460MB/s ± 4% 467MB/s ± 2% ~ (p=0.165 n=10+10)
RegexpMatchEasy0_1K-4 6.19GB/s ± 3% 6.28GB/s ± 3% ~ (p=0.165 n=10+10)
RegexpMatchEasy1_32-4 487MB/s ± 5% 497MB/s ± 2% +2.00% (p=0.043 n=10+10)
RegexpMatchEasy1_1K-4 3.67GB/s ± 2% 3.67GB/s ± 3% ~ (p=0.963 n=8+9)
RegexpMatchMedium_32-4 10.1MB/s ± 3% 10.1MB/s ± 4% ~ (p=0.435 n=10+9)
RegexpMatchMedium_1K-4 34.0MB/s ± 1% 33.7MB/s ± 2% ~ (p=0.173 n=8+10)
RegexpMatchHard_32-4 22.9MB/s ± 2% 22.7MB/s ± 4% ~ (p=0.565 n=10+10)
RegexpMatchHard_1K-4 24.0MB/s ± 3% 24.7MB/s ± 3% +2.64% (p=0.001 n=9+9)
Revcomp-4 766MB/s ± 4% 775MB/s ± 5% ~ (p=0.720 n=9+10)
Template-4 40.2MB/s ± 2% 39.2MB/s ± 3% -2.47% (p=0.002 n=8+10)
The rules match ~1800 times during all.bash.
Fixes #18943
Change-Id: I64be1ada34e89c486dfd935bf429b35652117ed4
Reviewed-on: https://go-review.googlesource.com/94766
Run-TryBot: Giovanni Bajo <rasky@develer.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2018-02-17 05:54:03 -07:00
|
|
|
/************************************
|
|
|
|
* 64-bit instructions
|
|
|
|
************************************/
|
|
|
|
|
|
|
|
// bitcheck64_constleft returns 1 if bit 63, 60, or 0 of a is set, and 0
// otherwise. Each bit is tested with a constant mask written as 1<<k; the
// directives expect amd64 to use BTQ for bits 63 and 60 and BTL for bit 0.
func bitcheck64_constleft(a uint64) (n int) {
|
|
|
|
// amd64:"BTQ\t[$]63"
|
|
|
|
if a&(1<<63) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTQ\t[$]60"
|
|
|
|
if a&(1<<60) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]0"
|
|
|
|
if a&(1<<0) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcheck64_constright tests single bits of the elements of a using a right
// shift by a constant, optionally followed by an AND with a constant. It
// returns 1 as soon as one of the conditions holds and 0 if none does. The
// directives expect amd64 to emit BTQ for bit positions 63 and 60 and BTL for
// positions 1, 0 and 7.
func bitcheck64_constright(a [8]uint64) (n int) {
|
|
|
|
// amd64:"BTQ\t[$]63"
|
|
|
|
if (a[0]>>63)&1 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTQ\t[$]63"
|
|
|
|
if a[1]>>63 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTQ\t[$]63"
|
|
|
|
if a[2]>>63 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTQ\t[$]60"
|
|
|
|
if (a[3]>>60)&1 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]1"
|
|
|
|
if (a[4]>>1)&1 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]0"
|
|
|
|
if (a[5]>>0)&1 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]7"
|
|
|
|
if (a[6]>>5)&4 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcheck64_var tests a bit whose index is a variable masked with 63, once
// written as a&(1<<k) and once as (b>>k)&1. It returns 1 if either tested bit
// is set and 0 otherwise. The directives expect BTQ on amd64 for both forms;
// the second one also rejects a bit test against the constant position 0.
func bitcheck64_var(a, b uint64) (n int) {
|
|
|
|
// amd64:"BTQ"
|
|
|
|
if a&(1<<(b&63)) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTQ",-"BT.\t[$]0"
|
|
|
|
if (b>>(a&63))&1 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcheck64_mask returns 1 if a has the bit selected by any of the literal
// masks 0x8000000000000000, 0x800000000000000 or 0x1 set, and 0 otherwise.
// The directives expect amd64 to use BTQ with positions 63 and 59 and BTL
// with position 0.
func bitcheck64_mask(a uint64) (n int) {
|
|
|
|
// amd64:"BTQ\t[$]63"
|
|
|
|
if a&0x8000000000000000 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTQ\t[$]59"
|
|
|
|
if a&0x800000000000000 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]0"
|
|
|
|
if a&0x1 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// biton64 returns the sum of four expressions that each OR a single bit into
// an operand: a variable bit (index a&63) into b, and the constant bits 63,
// 60 and 0 into a. The directives expect BTSQ on amd64 for the first three
// and an ORQ with the immediate 1 for bit 0.
func biton64(a, b uint64) (n uint64) {
|
|
|
|
// amd64:"BTSQ"
|
|
|
|
n += b | (1 << (a & 63))
|
|
|
|
|
|
|
|
// amd64:"BTSQ\t[$]63"
|
|
|
|
n += a | (1 << 63)
|
|
|
|
|
|
|
|
// amd64:"BTSQ\t[$]60"
|
|
|
|
n += a | (1 << 60)
|
|
|
|
|
|
|
|
// amd64:"ORQ\t[$]1"
|
|
|
|
n += a | (1 << 0)
|
|
|
|
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitoff64 returns the sum of four expressions that each clear a single bit
// with &^: a variable bit (index a&63) in b, and the constant bits 63, 60 and
// 0 in a. The directives expect BTRQ on amd64 for the first three and an ANDQ
// with the immediate -2 for bit 0.
func bitoff64(a, b uint64) (n uint64) {
|
|
|
|
// amd64:"BTRQ"
|
|
|
|
n += b &^ (1 << (a & 63))
|
|
|
|
|
|
|
|
// amd64:"BTRQ\t[$]63"
|
|
|
|
n += a &^ (1 << 63)
|
|
|
|
|
|
|
|
// amd64:"BTRQ\t[$]60"
|
|
|
|
n += a &^ (1 << 60)
|
|
|
|
|
|
|
|
// amd64:"ANDQ\t[$]-2"
|
|
|
|
n += a &^ (1 << 0)
|
|
|
|
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcompl64 returns the sum of four expressions that each toggle a single
// bit with XOR: a variable bit (index a&63) in b, and the constant bits 63,
// 60 and 0 in a. The directives expect BTCQ on amd64 for the first three and
// an XORQ with the immediate 1 for bit 0.
func bitcompl64(a, b uint64) (n uint64) {
|
|
|
|
// amd64:"BTCQ"
|
|
|
|
n += b ^ (1 << (a & 63))
|
|
|
|
|
|
|
|
// amd64:"BTCQ\t[$]63"
|
|
|
|
n += a ^ (1 << 63)
|
|
|
|
|
|
|
|
// amd64:"BTCQ\t[$]60"
|
|
|
|
n += a ^ (1 << 60)
|
|
|
|
|
|
|
|
// amd64:"XORQ\t[$]1"
|
|
|
|
n += a ^ (1 << 0)
|
|
|
|
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
/************************************
|
|
|
|
* 32-bit instructions
|
|
|
|
************************************/
|
|
|
|
|
|
|
|
// bitcheck32_constleft returns 1 if bit 31, 28, or 0 of a is set, and 0
// otherwise. Each bit is tested with a constant mask written as 1<<k; the
// directives expect amd64 to use BTL with the matching position in all three
// cases.
func bitcheck32_constleft(a uint32) (n int) {
|
|
|
|
// amd64:"BTL\t[$]31"
|
|
|
|
if a&(1<<31) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]28"
|
|
|
|
if a&(1<<28) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]0"
|
|
|
|
if a&(1<<0) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcheck32_constright tests single bits of the elements of a using a right
// shift by a constant, optionally followed by an AND with a constant. It
// returns 1 as soon as one of the conditions holds and 0 if none does. The
// directives expect amd64 to emit BTL with positions 31, 28, 1, 0 and 7.
func bitcheck32_constright(a [8]uint32) (n int) {
|
|
|
|
// amd64:"BTL\t[$]31"
|
|
|
|
if (a[0]>>31)&1 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]31"
|
|
|
|
if a[1]>>31 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]31"
|
|
|
|
if a[2]>>31 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]28"
|
|
|
|
if (a[3]>>28)&1 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]1"
|
|
|
|
if (a[4]>>1)&1 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]0"
|
|
|
|
if (a[5]>>0)&1 == 0 {
|
2018-02-26 17:59:58 -07:00
|
|
|
return 1
|
|
|
|
}
|
cmd/compile: add patterns for bit set/clear/complement on amd64
This patch completes implementation of BT(Q|L), and adds support
for BT(S|R|C)(Q|L).
Example of code changes from time.(*Time).addSec:
if t.wall&hasMonotonic != 0 {
0x1073465 488b08 MOVQ 0(AX), CX
0x1073468 4889ca MOVQ CX, DX
0x107346b 48c1e93f SHRQ $0x3f, CX
0x107346f 48c1e13f SHLQ $0x3f, CX
0x1073473 48f7c1ffffffff TESTQ $-0x1, CX
0x107347a 746b JE 0x10734e7
if t.wall&hasMonotonic != 0 {
0x1073435 488b08 MOVQ 0(AX), CX
0x1073438 480fbae13f BTQ $0x3f, CX
0x107343d 7363 JAE 0x10734a2
Another example:
t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic
0x10734c8 4881e1ffffff3f ANDQ $0x3fffffff, CX
0x10734cf 48c1e61e SHLQ $0x1e, SI
0x10734d3 4809ce ORQ CX, SI
0x10734d6 48b90000000000000080 MOVQ $0x8000000000000000, CX
0x10734e0 4809f1 ORQ SI, CX
0x10734e3 488908 MOVQ CX, 0(AX)
t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic
0x107348b 4881e2ffffff3f ANDQ $0x3fffffff, DX
0x1073492 48c1e61e SHLQ $0x1e, SI
0x1073496 4809f2 ORQ SI, DX
0x1073499 480fbaea3f BTSQ $0x3f, DX
0x107349e 488910 MOVQ DX, 0(AX)
Go1 benchmarks seem unaffected, and I would be surprised
otherwise:
name old time/op new time/op delta
BinaryTree17-4 2.64s ± 4% 2.56s ± 9% -2.92% (p=0.008 n=9+9)
Fannkuch11-4 2.90s ± 1% 2.95s ± 3% +1.76% (p=0.010 n=10+9)
FmtFprintfEmpty-4 35.3ns ± 1% 34.5ns ± 2% -2.34% (p=0.004 n=9+8)
FmtFprintfString-4 57.0ns ± 1% 58.4ns ± 5% +2.52% (p=0.029 n=9+10)
FmtFprintfInt-4 59.8ns ± 3% 59.8ns ± 6% ~ (p=0.565 n=10+10)
FmtFprintfIntInt-4 93.9ns ± 3% 91.2ns ± 5% -2.94% (p=0.014 n=10+9)
FmtFprintfPrefixedInt-4 107ns ± 6% 104ns ± 6% ~ (p=0.099 n=10+10)
FmtFprintfFloat-4 187ns ± 3% 188ns ± 3% ~ (p=0.505 n=10+9)
FmtManyArgs-4 410ns ± 1% 415ns ± 6% ~ (p=0.649 n=8+10)
GobDecode-4 5.30ms ± 3% 5.27ms ± 3% ~ (p=0.436 n=10+10)
GobEncode-4 4.62ms ± 5% 4.47ms ± 2% -3.24% (p=0.001 n=9+10)
Gzip-4 197ms ± 4% 193ms ± 3% ~ (p=0.123 n=10+10)
Gunzip-4 30.4ms ± 3% 30.1ms ± 3% ~ (p=0.481 n=10+10)
HTTPClientServer-4 76.3µs ± 1% 76.0µs ± 1% ~ (p=0.236 n=8+9)
JSONEncode-4 10.5ms ± 9% 10.3ms ± 3% ~ (p=0.280 n=10+10)
JSONDecode-4 42.3ms ±10% 41.3ms ± 2% ~ (p=0.053 n=9+10)
Mandelbrot200-4 3.80ms ± 2% 3.72ms ± 2% -2.15% (p=0.001 n=9+10)
GoParse-4 2.88ms ±10% 2.81ms ± 2% ~ (p=0.247 n=10+10)
RegexpMatchEasy0_32-4 69.5ns ± 4% 68.6ns ± 2% ~ (p=0.171 n=10+10)
RegexpMatchEasy0_1K-4 165ns ± 3% 162ns ± 3% ~ (p=0.137 n=10+10)
RegexpMatchEasy1_32-4 65.7ns ± 6% 64.4ns ± 2% -2.02% (p=0.037 n=10+10)
RegexpMatchEasy1_1K-4 278ns ± 2% 279ns ± 3% ~ (p=0.991 n=8+9)
RegexpMatchMedium_32-4 99.3ns ± 3% 98.5ns ± 4% ~ (p=0.457 n=10+9)
RegexpMatchMedium_1K-4 30.1µs ± 1% 30.4µs ± 2% ~ (p=0.173 n=8+10)
RegexpMatchHard_32-4 1.40µs ± 2% 1.41µs ± 4% ~ (p=0.565 n=10+10)
RegexpMatchHard_1K-4 42.5µs ± 1% 41.5µs ± 3% -2.13% (p=0.002 n=8+9)
Revcomp-4 332ms ± 4% 328ms ± 5% ~ (p=0.720 n=9+10)
Template-4 48.3ms ± 2% 49.6ms ± 3% +2.56% (p=0.002 n=8+10)
TimeParse-4 252ns ± 2% 249ns ± 3% ~ (p=0.116 n=9+10)
TimeFormat-4 262ns ± 4% 252ns ± 3% -4.01% (p=0.000 n=9+10)
name old speed new speed delta
GobDecode-4 145MB/s ± 3% 146MB/s ± 3% ~ (p=0.436 n=10+10)
GobEncode-4 166MB/s ± 5% 172MB/s ± 2% +3.28% (p=0.001 n=9+10)
Gzip-4 98.6MB/s ± 4% 100.4MB/s ± 3% ~ (p=0.123 n=10+10)
Gunzip-4 639MB/s ± 3% 645MB/s ± 3% ~ (p=0.481 n=10+10)
JSONEncode-4 185MB/s ± 8% 189MB/s ± 3% ~ (p=0.280 n=10+10)
JSONDecode-4 46.0MB/s ± 9% 47.0MB/s ± 2% +2.21% (p=0.046 n=9+10)
GoParse-4 20.1MB/s ± 9% 20.6MB/s ± 2% ~ (p=0.239 n=10+10)
RegexpMatchEasy0_32-4 460MB/s ± 4% 467MB/s ± 2% ~ (p=0.165 n=10+10)
RegexpMatchEasy0_1K-4 6.19GB/s ± 3% 6.28GB/s ± 3% ~ (p=0.165 n=10+10)
RegexpMatchEasy1_32-4 487MB/s ± 5% 497MB/s ± 2% +2.00% (p=0.043 n=10+10)
RegexpMatchEasy1_1K-4 3.67GB/s ± 2% 3.67GB/s ± 3% ~ (p=0.963 n=8+9)
RegexpMatchMedium_32-4 10.1MB/s ± 3% 10.1MB/s ± 4% ~ (p=0.435 n=10+9)
RegexpMatchMedium_1K-4 34.0MB/s ± 1% 33.7MB/s ± 2% ~ (p=0.173 n=8+10)
RegexpMatchHard_32-4 22.9MB/s ± 2% 22.7MB/s ± 4% ~ (p=0.565 n=10+10)
RegexpMatchHard_1K-4 24.0MB/s ± 3% 24.7MB/s ± 3% +2.64% (p=0.001 n=9+9)
Revcomp-4 766MB/s ± 4% 775MB/s ± 5% ~ (p=0.720 n=9+10)
Template-4 40.2MB/s ± 2% 39.2MB/s ± 3% -2.47% (p=0.002 n=8+10)
The rules match ~1800 times during all.bash.
Fixes #18943
Change-Id: I64be1ada34e89c486dfd935bf429b35652117ed4
Reviewed-on: https://go-review.googlesource.com/94766
Run-TryBot: Giovanni Bajo <rasky@develer.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2018-02-17 05:54:03 -07:00
|
|
|
// amd64:"BTL\t[$]7"
|
|
|
|
if (a[6]>>5)&4 == 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcheck32_var tests a bit whose index is a variable masked with 31, once
// written as a&(1<<k) and once as (b>>k)&1. It returns 1 if either tested bit
// is set and 0 otherwise. The directives expect BTL on amd64 for both forms;
// the second one also rejects a bit test against the constant position 0.
func bitcheck32_var(a, b uint32) (n int) {
|
|
|
|
// amd64:"BTL"
|
|
|
|
if a&(1<<(b&31)) != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL",-"BT.\t[$]0"
|
|
|
|
if (b>>(a&31))&1 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcheck32_mask returns 1 if a has the bit selected by any of the literal
// masks 0x80000000, 0x8000000 or 0x1 set, and 0 otherwise. The directives
// expect amd64 to use BTL with positions 31, 27 and 0.
func bitcheck32_mask(a uint32) (n int) {
|
|
|
|
// amd64:"BTL\t[$]31"
|
|
|
|
if a&0x80000000 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]27"
|
|
|
|
if a&0x8000000 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
// amd64:"BTL\t[$]0"
|
|
|
|
if a&0x1 != 0 {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// biton32 returns the sum of four expressions that each OR a single bit into
// an operand: a variable bit (index a&31) into b, and the constant bits 31,
// 28 and 0 into a. The directives expect BTSL on amd64 for the first three
// and an ORL with the immediate 1 for bit 0.
func biton32(a, b uint32) (n uint32) {
|
|
|
|
// amd64:"BTSL"
|
|
|
|
n += b | (1 << (a & 31))
|
|
|
|
|
|
|
|
// amd64:"BTSL\t[$]31"
|
|
|
|
n += a | (1 << 31)
|
|
|
|
|
|
|
|
// amd64:"BTSL\t[$]28"
|
|
|
|
n += a | (1 << 28)
|
|
|
|
|
|
|
|
// amd64:"ORL\t[$]1"
|
|
|
|
n += a | (1 << 0)
|
|
|
|
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitoff32 returns the sum of four expressions that each clear a single bit
// with &^: a variable bit (index a&31) in b, and the constant bits 31, 28 and
// 0 in a. The directives expect BTRL on amd64 for the first three and an ANDL
// with the immediate -2 for bit 0.
func bitoff32(a, b uint32) (n uint32) {
|
|
|
|
// amd64:"BTRL"
|
|
|
|
n += b &^ (1 << (a & 31))
|
|
|
|
|
|
|
|
// amd64:"BTRL\t[$]31"
|
|
|
|
n += a &^ (1 << 31)
|
|
|
|
|
|
|
|
// amd64:"BTRL\t[$]28"
|
|
|
|
n += a &^ (1 << 28)
|
|
|
|
|
|
|
|
// amd64:"ANDL\t[$]-2"
|
|
|
|
n += a &^ (1 << 0)
|
|
|
|
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitcompl32 returns the sum of four expressions that each toggle a single
// bit with XOR: a variable bit (index a&31) in b, and the constant bits 31,
// 28 and 0 in a. The directives expect BTCL on amd64 for the first three and
// an XORL with the immediate 1 for bit 0.
func bitcompl32(a, b uint32) (n uint32) {
|
|
|
|
// amd64:"BTCL"
|
|
|
|
n += b ^ (1 << (a & 31))
|
|
|
|
|
|
|
|
// amd64:"BTCL\t[$]31"
|
|
|
|
n += a ^ (1 << 31)
|
|
|
|
|
|
|
|
// amd64:"BTCL\t[$]28"
|
|
|
|
n += a ^ (1 << 28)
|
|
|
|
|
|
|
|
// amd64:"XORL\t[$]1"
|
|
|
|
n += a ^ (1 << 0)
|
|
|
|
|
|
|
|
return n
|
2018-02-26 17:59:58 -07:00
|
|
|
}
|
2018-04-10 03:20:20 -06:00
|
|
|
|
cmd/compile: fix long RMW bit operations on AMD64
Under certain circumstances, the existing rules for bit operations can
produce code that writes beyond its intended bounds. For example,
consider the following code:
func repro(b []byte, addr, bit int32) {
_ = b[3]
v := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 | 1<<(bit&31)
b[0] = byte(v)
b[1] = byte(v >> 8)
b[2] = byte(v >> 16)
b[3] = byte(v >> 24)
}
Roughly speaking:
1. The expression `1 << (bit & 31)` is rewritten into `(SHLL 1 bit)`
2. The expression `uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 |
uint32(b[3])<<24` is rewritten into `(MOVLload &b[0])`
3. The statements `b[0] = byte(v) ... b[3] = byte(v >> 24)` are
rewritten into `(MOVLstore &b[0], v)`
4. `(ORL (SHLL 1, bit) (MOVLload &b[0]))` is rewritten into
`(BTSL (MOVLload &b[0]) bit)`. This is a valid transformation because
the destination is a register: in this case, the bit offset is masked
by the number of bits in the destination register. This is identical
to the masking performed by `SHL`.
5. `(MOVLstore &b[0] (BTSL (MOVLload &b[0]) bit))` is rewritten into
`(BTSLmodify &b[0] bit)`. This is an invalid transformation because
the destination is memory: in this case, the bit offset is not
masked, and the chosen instruction may write outside its intended
32-bit location.
These changes fix the invalid rewrite performed in step (5) by
explicitly masking the bit offset operand to `BT(S|R|C)(L|Q)modify`. In
the example above, the adjusted rules produce
`(BTSLmodify &b[0] (ANDLconst [31] bit))` in step (5).
These changes also add several new rules to rewrite bit sets, toggles,
and clears that are rooted at `(OR|XOR|AND)(L|Q)modify` operators into
appropriate `BT(S|R|C)(L|Q)modify` operators. These rules catch cases
where `MOV(L|Q)store ((OR|XOR|AND)(L|Q) ...)` is rewritten to
`(OR|XOR|AND)(L|Q)modify` before the `(OR|XOR|AND)(L|Q) ...` can be
rewritten to `BT(S|R|C)(L|Q) ...`.
Overall, compilecmp reports small improvements in code size on
darwin/amd64 when the changes to the compiler itself are excluded:
file before after Δ %
runtime.s 536464 536412 -52 -0.010%
bytes.s 32629 32593 -36 -0.110%
strings.s 44565 44529 -36 -0.081%
os/signal.s 7967 7959 -8 -0.100%
cmd/vendor/golang.org/x/sys/unix.s 81686 81678 -8 -0.010%
math/big.s 188235 188253 +18 +0.010%
cmd/link/internal/loader.s 89295 89056 -239 -0.268%
cmd/link/internal/ld.s 633551 633232 -319 -0.050%
cmd/link/internal/arm.s 18934 18928 -6 -0.032%
cmd/link/internal/arm64.s 31814 31801 -13 -0.041%
cmd/link/internal/riscv64.s 7347 7345 -2 -0.027%
cmd/compile/internal/ssa.s 4029173 4033066 +3893 +0.097%
total 21298280 21301472 +3192 +0.015%
Change-Id: I2e560548b515865129e1724e150e30540e9d29ce
GitHub-Last-Rev: 9a42bd29a55b3917651aecab6932074df96535ae
GitHub-Pull-Request: golang/go#45242
Reviewed-on: https://go-review.googlesource.com/c/go/+/304869
Reviewed-by: Keith Randall <khr@golang.org>
Trust: Josh Bleecher Snyder <josharian@gmail.com>
2021-03-26 11:48:42 -06:00
|
|
|
// check direct operation on memory with constant and shifted constant sources
|
|
|
|
// bitOpOnMem updates a[0], a[1] and a[2] in place with &=, |= and ^= against
// the constants 200, 220 and 240. The directives expect amd64 to emit ANDL,
// ORL and XORL with an immediate source and a parenthesized-register memory
// destination at offsets 0, 4 and 8. The parameters b, c and d are not used
// in the body.
func bitOpOnMem(a []uint32, b, c, d uint32) {
|
2021-04-12 12:00:49 -06:00
|
|
|
// amd64:`ANDL\s[$]200,\s\([A-Z][A-Z0-9]+\)`
|
2018-06-26 20:46:17 -06:00
|
|
|
a[0] &= 200
|
2021-04-12 12:00:49 -06:00
|
|
|
// amd64:`ORL\s[$]220,\s4\([A-Z][A-Z0-9]+\)`
|
2018-06-26 20:46:17 -06:00
|
|
|
a[1] |= 220
|
2021-04-12 12:00:49 -06:00
|
|
|
// amd64:`XORL\s[$]240,\s8\([A-Z][A-Z0-9]+\)`
|
2018-06-26 20:46:17 -06:00
|
|
|
a[2] ^= 240
|
|
|
|
}
|
|
|
|
|
2020-04-20 15:43:30 -06:00
|
|
|
// bitcheckMostNegative reports whether bit 7 (mask 0x80) of b is set. The
// directive expects amd64 to use TESTB for this comparison.
func bitcheckMostNegative(b uint8) bool {
|
|
|
|
// amd64:"TESTB"
|
|
|
|
return b&0x80 == 0x80
|
|
|
|
}
|
|
|
|
|
2018-04-10 03:20:20 -06:00
|
|
|
// Check AND masking on arm64 (Issue #19857)
|
|
|
|
|
|
|
|
// and_mask_1 returns a with bit 63 cleared, using the mask (1<<63)-1. The
// directive expects arm64 to emit an AND instruction.
func and_mask_1(a uint64) uint64 {
|
|
|
|
// arm64:`AND\t`
|
|
|
|
return a & ((1 << 63) - 1)
|
|
|
|
}
|
|
|
|
|
|
|
|
// and_mask_2 returns a with every bit except bit 63 cleared, using the mask
// 1<<63. The directive expects arm64 to emit an AND instruction.
func and_mask_2(a uint64) uint64 {
|
|
|
|
// arm64:`AND\t`
|
|
|
|
return a & (1 << 63)
|
|
|
|
}
|
|
|
|
|
2018-09-10 02:29:52 -06:00
|
|
|
// and_mask_3 masks a with 0xffffaaaa and b with 0xffc003ff and returns both
// results. The directives target arm/7 and expect BIC rather than AND for the
// first mask, and BFC with neither AND nor BIC for the second.
func and_mask_3(a, b uint32) (uint32, uint32) {
|
2018-07-10 19:30:32 -06:00
|
|
|
// arm/7:`BIC`,-`AND`
|
2018-09-10 02:29:52 -06:00
|
|
|
a &= 0xffffaaaa
|
|
|
|
// arm/7:`BFC`,-`AND`,-`BIC`
|
|
|
|
b &= 0xffc003ff
|
|
|
|
return a, b
|
2018-07-10 19:30:32 -06:00
|
|
|
}
|
|
|
|
|
2018-04-10 03:20:20 -06:00
|
|
|
// Check generation of arm64 BIC/EON/ORN instructions
|
|
|
|
|
|
|
|
// op_bic returns x with the bits that are set in y cleared (x &^ y). The
// directive expects arm64 to emit BIC and no AND.
func op_bic(x, y uint32) uint32 {
|
|
|
|
// arm64:`BIC\t`,-`AND`
|
|
|
|
return x &^ y
|
|
|
|
}
|
|
|
|
|
2020-06-04 21:53:53 -06:00
|
|
|
// op_eon stores three 32-bit expressions that combine XOR with a bitwise
// complement into a[0], a[1] and a[2], and returns the 64-bit result
// n ^ (m ^ 0xffffffffffffffff). The directives expect arm64 to emit EON for
// each expression; three of them also reject EOR and MVN, while the one for
// a[2] rejects XOR.
// NOTE(review): the other checks in this function exclude EOR, so the XOR
// exclusion on a[2] may have been meant as EOR; confirm before changing it.
func op_eon(x, y, z uint32, a []uint32, n, m uint64) uint64 {
|
|
|
|
// arm64:`EON\t`,-`EOR`,-`MVN`
|
|
|
|
a[0] = x ^ (y ^ 0xffffffff)
|
|
|
|
|
|
|
|
// arm64:`EON\t`,-`EOR`,-`MVN`
|
|
|
|
a[1] = ^(y ^ z)
|
|
|
|
|
2018-04-10 03:20:20 -06:00
|
|
|
// arm64:`EON\t`,-`XOR`
|
2020-06-04 21:53:53 -06:00
|
|
|
a[2] = x ^ ^z
|
|
|
|
|
|
|
|
// arm64:`EON\t`,-`EOR`,-`MVN`
|
|
|
|
return n ^ (m ^ 0xffffffffffffffff)
|
2018-04-10 03:20:20 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// op_orn returns x ORed with the bitwise complement of y (x | ^y). The
// directive expects arm64 to emit ORN and no ORR.
func op_orn(x, y uint32) uint32 {
|
|
|
|
// arm64:`ORN\t`,-`ORR`
|
|
|
|
return x | ^y
|
|
|
|
}
|
2019-05-08 04:02:23 -06:00
|
|
|
|
|
|
|
// check bitsets
|
|
|
|
// bitSetPowerOf2Test reports whether bit 3 of x is set, written as a
// comparison of x&8 against 8. The directive expects amd64 to use BTL with
// position 3.
func bitSetPowerOf2Test(x int) bool {
|
|
|
|
// amd64:"BTL\t[$]3"
|
|
|
|
return x&8 == 8
|
|
|
|
}
|
|
|
|
|
|
|
|
// bitSetTest reports whether every bit of the mask 9 is set in x. The
// directives expect amd64 to emit an ANDL of the immediate 9 into AX and a
// CMPQ of AX against 9.
func bitSetTest(x int) bool {
|
2021-10-10 09:56:16 -06:00
|
|
|
// amd64:"ANDL\t[$]9, AX"
|
2019-05-08 04:02:23 -06:00
|
|
|
// amd64:"CMPQ\tAX, [$]9"
|
|
|
|
return x&9 == 9
|
|
|
|
}
|
2020-05-11 10:44:48 -06:00
|
|
|
|
|
|
|
// mask contiguous one bits
|
|
|
|
// cont1Mask64U returns x masked with 0x0000ffffffff0000. The directive
// expects s390x to emit RISBGZ whose first three operands are $16, $47
// and $0.
func cont1Mask64U(x uint64) uint64 {
|
|
|
|
// s390x:"RISBGZ\t[$]16, [$]47, [$]0,"
|
2020-11-08 01:44:33 -07:00
|
|
|
return x & 0x0000ffffffff0000
|
2020-05-11 10:44:48 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
// mask contiguous zero bits
|
|
|
|
// cont0Mask64U returns x masked with 0xffff00000000ffff. The directive
// expects s390x to emit RISBGZ whose first three operands are $48, $15
// and $0.
func cont0Mask64U(x uint64) uint64 {
|
|
|
|
// s390x:"RISBGZ\t[$]48, [$]15, [$]0,"
|
2020-11-08 01:44:33 -07:00
|
|
|
return x & 0xffff00000000ffff
|
2020-05-11 10:44:48 -06:00
|
|
|
}
|
2021-02-12 16:55:25 -07:00
|
|
|
|
|
|
|
// issue44228a reports whether bit i&63 of the element a[i>>6] is set. The
// directive expects amd64 to emit BTQ and no SHL for the test.
func issue44228a(a []int64, i int) bool {
|
|
|
|
// amd64: "BTQ", -"SHL"
|
|
|
|
return a[i>>6]&(1<<(i&63)) != 0
|
|
|
|
}
|
|
|
|
// issue44228b reports whether bit i&31 of the element a[i>>5] is set. The
// directive expects amd64 to emit BTL and no SHL for the test.
func issue44228b(a []int32, i int) bool {
|
|
|
|
// amd64: "BTL", -"SHL"
|
|
|
|
return a[i>>5]&(1<<(i&31)) != 0
|
|
|
|
}
|
2021-09-19 10:23:37 -06:00
|
|
|
|
|
|
|
// issue48467 computes x-y with bits.Sub64 and returns x minus the difference
// ANDed with the negated borrow, that is x - (d & (-borrow)). The directive
// expects arm64 not to emit NEG on the bits.Sub64 line.
func issue48467(x, y uint64) uint64 {
|
|
|
|
// arm64: -"NEG"
|
|
|
|
d, borrow := bits.Sub64(x, y, 0)
|
|
|
|
return x - d&(-borrow)
|
|
|
|
}
|
cmd/compile: add late lower pass for last rules to run
Optimization rules usually have implicit priorities: some need to
run first, some next, and some last in order to produce the best
code. Currently our optimization rules have no priority, so this CL
adds a late lower pass that runs the rules that need to run last,
such as splitting unreasonable constant folding. This pass can be seen as
the second round of the lower pass.
For example:
func foo(a, b uint64) uint64 {
d := a+0x1234568
d1 := b+0x1234568
return d&d1
}
The code generated by the master branch:
0x0004 00004 ADD $19088744, R0, R2 // movz+movk+add
0x0010 00016 ADD $19088744, R1, R1 // movz+movk+add
0x001c 00028 AND R1, R2, R0
This is because the current constant folding optimization rules do not
take into account the range of constants, causing the constant to be
loaded repeatedly. This CL splits these unreasonable constants folding
in the late lower pass. With this CL the generated code:
0x0004 00004 MOVD $19088744, R2 // movz+movk
0x000c 00012 ADD R0, R2, R3
0x0010 00016 ADD R1, R2, R1
0x0014 00020 AND R1, R3, R0
This CL also adds constant folding optimization for ADDS instruction.
In addition, in order not to introduce the codegen regression, an
optimization rule is added to change the addition of a negative number
into a subtraction of a positive number.
go1 benchmarks:
name old time/op new time/op delta
BinaryTree17-8 1.22s ± 1% 1.24s ± 0% +1.56% (p=0.008 n=5+5)
Fannkuch11-8 1.54s ± 0% 1.53s ± 0% -0.69% (p=0.016 n=4+5)
FmtFprintfEmpty-8 14.1ns ± 0% 14.1ns ± 0% ~ (p=0.079 n=4+5)
FmtFprintfString-8 26.0ns ± 0% 26.1ns ± 0% +0.23% (p=0.008 n=5+5)
FmtFprintfInt-8 32.3ns ± 0% 32.9ns ± 1% +1.72% (p=0.008 n=5+5)
FmtFprintfIntInt-8 54.5ns ± 0% 55.5ns ± 0% +1.83% (p=0.008 n=5+5)
FmtFprintfPrefixedInt-8 61.5ns ± 0% 62.0ns ± 0% +0.93% (p=0.008 n=5+5)
FmtFprintfFloat-8 72.0ns ± 0% 73.6ns ± 0% +2.24% (p=0.008 n=5+5)
FmtManyArgs-8 221ns ± 0% 224ns ± 0% +1.22% (p=0.008 n=5+5)
GobDecode-8 1.91ms ± 0% 1.93ms ± 0% +0.98% (p=0.008 n=5+5)
GobEncode-8 1.40ms ± 1% 1.39ms ± 0% -0.79% (p=0.032 n=5+5)
Gzip-8 115ms ± 0% 117ms ± 1% +1.17% (p=0.008 n=5+5)
Gunzip-8 19.4ms ± 1% 19.3ms ± 0% -0.71% (p=0.016 n=5+4)
HTTPClientServer-8 27.0µs ± 0% 27.3µs ± 0% +0.80% (p=0.008 n=5+5)
JSONEncode-8 3.36ms ± 1% 3.33ms ± 0% ~ (p=0.056 n=5+5)
JSONDecode-8 17.5ms ± 2% 17.8ms ± 0% +1.71% (p=0.016 n=5+4)
Mandelbrot200-8 2.29ms ± 0% 2.29ms ± 0% ~ (p=0.151 n=5+5)
GoParse-8 1.35ms ± 1% 1.36ms ± 1% ~ (p=0.056 n=5+5)
RegexpMatchEasy0_32-8 24.5ns ± 0% 24.5ns ± 0% ~ (p=0.444 n=4+5)
RegexpMatchEasy0_1K-8 131ns ±11% 118ns ± 6% ~ (p=0.056 n=5+5)
RegexpMatchEasy1_32-8 22.9ns ± 0% 22.9ns ± 0% ~ (p=0.905 n=4+5)
RegexpMatchEasy1_1K-8 126ns ± 0% 127ns ± 0% ~ (p=0.063 n=4+5)
RegexpMatchMedium_32-8 486ns ± 5% 483ns ± 0% ~ (p=0.381 n=5+4)
RegexpMatchMedium_1K-8 15.4µs ± 1% 15.5µs ± 0% ~ (p=0.151 n=5+5)
RegexpMatchHard_32-8 687ns ± 0% 686ns ± 0% ~ (p=0.103 n=5+5)
RegexpMatchHard_1K-8 20.7µs ± 0% 20.7µs ± 1% ~ (p=0.151 n=5+5)
Revcomp-8 175ms ± 2% 176ms ± 3% ~ (p=1.000 n=5+5)
Template-8 20.4ms ± 6% 20.1ms ± 2% ~ (p=0.151 n=5+5)
TimeParse-8 112ns ± 0% 113ns ± 0% +0.97% (p=0.016 n=5+4)
TimeFormat-8 156ns ± 0% 145ns ± 0% -7.14% (p=0.029 n=4+4)
Change-Id: I3ced26e89041f873ac989586514ccc5ee09f13da
Reviewed-on: https://go-review.googlesource.com/c/go/+/425134
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Eric Fang <eric.fang@arm.com>
2022-08-17 04:01:17 -06:00
|
|
|
|
|
|
|
// foldConst adds the constant 7 to x with bits.Add64 and returns the carry
// ANDed with the sum. The directive expects arm64 to emit ADDS with the
// immediate 7 and not to load 7 with MOVD. The parameter y is not used in
// the body.
func foldConst(x, y uint64) uint64 {
|
|
|
|
// arm64: "ADDS\t[$]7",-"MOVD\t[$]7"
|
|
|
|
d, b := bits.Add64(x, 7, 0)
|
|
|
|
return b & d
|
|
|
|
}
|
|
|
|
|
|
|
|
// foldConstOutOfRange returns a plus the constant 0x1234568 (19088744). The
// directive expects arm64 to load that constant with MOVD and not to use it
// as an immediate operand of ADD.
func foldConstOutOfRange(a uint64) uint64 {
|
|
|
|
// arm64: "MOVD\t[$]19088744",-"ADD\t[$]19088744"
|
|
|
|
return a + 0x1234568
|
|
|
|
}
|