1
0
mirror of https://github.com/golang/go synced 2024-11-17 22:44:41 -07:00
go/test/codegen/bitfield.go

369 lines
9.6 KiB
Go
Raw Normal View History

cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
// This file contains codegen tests related to bit field
// insertion/extraction simplifications/optimizations.
func extr1(x, x2 uint64) uint64 {
return x<<7 + x2>>57 // arm64:"EXTR\t[$]57,"
}
func extr2(x, x2 uint64) uint64 {
return x<<7 | x2>>57 // arm64:"EXTR\t[$]57,"
}
func extr3(x, x2 uint64) uint64 {
return x<<7 ^ x2>>57 // arm64:"EXTR\t[$]57,"
}
func extr4(x, x2 uint32) uint32 {
return x<<7 + x2>>25 // arm64:"EXTRW\t[$]25,"
}
func extr5(x, x2 uint32) uint32 {
return x<<7 | x2>>25 // arm64:"EXTRW\t[$]25,"
}
func extr6(x, x2 uint32) uint32 {
return x<<7 ^ x2>>25 // arm64:"EXTRW\t[$]25,"
}
// check 32-bit shift masking
func mask32(x uint32) uint32 {
return (x << 29) >> 29 // arm64:"AND\t[$]7, R[0-9]+",-"LSR",-"LSL"
}
// check 16-bit shift masking
func mask16(x uint16) uint16 {
return (x << 14) >> 14 // arm64:"AND\t[$]3, R[0-9]+",-"LSR",-"LSL"
}
// check 8-bit shift masking
func mask8(x uint8) uint8 {
return (x << 7) >> 7 // arm64:"AND\t[$]1, R[0-9]+",-"LSR",-"LSL"
}
func maskshift(x uint64) uint64 {
// arm64:"AND\t[$]4095, R[0-9]+",-"LSL",-"LSR",-"UBFIZ",-"UBFX"
return ((x << 5) & (0xfff << 5)) >> 5
}
// bitfield ops
// bfi
func bfi1(x, y uint64) uint64 {
// arm64:"BFI\t[$]4, R[0-9]+, [$]12",-"LSL",-"LSR",-"AND"
return ((x & 0xfff) << 4) | (y & 0xffffffffffff000f)
}
func bfi2(x, y uint64) uint64 {
// arm64:"BFI\t[$]12, R[0-9]+, [$]40",-"LSL",-"LSR",-"AND"
return (x << 24 >> 12) | (y & 0xfff0000000000fff)
}
// bfxil
func bfxil1(x, y uint64) uint64 {
// arm64:"BFXIL\t[$]5, R[0-9]+, [$]12",-"LSL",-"LSR",-"AND"
return ((x >> 5) & 0xfff) | (y & 0xfffffffffffff000)
}
func bfxil2(x, y uint64) uint64 {
// arm64:"BFXIL\t[$]12, R[0-9]+, [$]40",-"LSL",-"LSR",-"AND"
return (x << 12 >> 24) | (y & 0xffffff0000000000)
}
// sbfiz
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shifts into sbfiz: (x << lc) >> rc && lc > rc.
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func sbfiz1(x int64) int64 {
// arm64:"SBFIZ\t[$]1, R[0-9]+, [$]60",-"LSL",-"ASR"
return (x << 4) >> 3
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shift and sign-extension into sbfiz.
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func sbfiz2(x int32) int64 {
return int64(x << 3) // arm64:"SBFIZ\t[$]3, R[0-9]+, [$]29",-"LSL"
}
func sbfiz3(x int16) int64 {
return int64(x << 3) // arm64:"SBFIZ\t[$]3, R[0-9]+, [$]13",-"LSL"
}
func sbfiz4(x int8) int64 {
return int64(x << 3) // arm64:"SBFIZ\t[$]3, R[0-9]+, [$]5",-"LSL"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// sbfiz combinations.
// merge shift with sbfiz into sbfiz.
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func sbfiz5(x int32) int32 {
// arm64:"SBFIZ\t[$]1, R[0-9]+, [$]28",-"LSL",-"ASR"
return (x << 4) >> 3
}
func sbfiz6(x int16) int64 {
return int64(x+1) << 3 // arm64:"SBFIZ\t[$]3, R[0-9]+, [$]16",-"LSL"
}
func sbfiz7(x int8) int64 {
return int64(x+1) << 62 // arm64:"SBFIZ\t[$]62, R[0-9]+, [$]2",-"LSL"
}
func sbfiz8(x int32) int64 {
return int64(x+1) << 40 // arm64:"SBFIZ\t[$]40, R[0-9]+, [$]24",-"LSL"
}
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// sbfx
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shifts into sbfx: (x << lc) >> rc && lc <= rc.
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func sbfx1(x int64) int64 {
return (x << 3) >> 4 // arm64:"SBFX\t[$]1, R[0-9]+, [$]60",-"LSL",-"ASR"
}
func sbfx2(x int64) int64 {
return (x << 60) >> 60 // arm64:"SBFX\t[$]0, R[0-9]+, [$]4",-"LSL",-"ASR"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
}
// merge shift and sign-extension into sbfx.
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func sbfx3(x int32) int64 {
return int64(x) >> 3 // arm64:"SBFX\t[$]3, R[0-9]+, [$]29",-"ASR"
}
func sbfx4(x int16) int64 {
return int64(x) >> 3 // arm64:"SBFX\t[$]3, R[0-9]+, [$]13",-"ASR"
}
func sbfx5(x int8) int64 {
return int64(x) >> 3 // arm64:"SBFX\t[$]3, R[0-9]+, [$]5",-"ASR"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func sbfx6(x int32) int64 {
return int64(x >> 30) // arm64:"SBFX\t[$]30, R[0-9]+, [$]2"
}
func sbfx7(x int16) int64 {
return int64(x >> 10) // arm64:"SBFX\t[$]10, R[0-9]+, [$]6"
}
func sbfx8(x int8) int64 {
return int64(x >> 5) // arm64:"SBFX\t[$]5, R[0-9]+, [$]3"
}
// sbfx combinations.
// merge shifts with sbfiz into sbfx.
func sbfx9(x int32) int32 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return (x << 3) >> 4 // arm64:"SBFX\t[$]1, R[0-9]+, [$]28",-"LSL",-"ASR"
}
// merge sbfx and sign-extension into sbfx.
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func sbfx10(x int32) int64 {
c := x + 5
return int64(c >> 20) // arm64"SBFX\t[$]20, R[0-9]+, [$]12",-"MOVW\tR[0-9]+, R[0-9]+"
}
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// ubfiz
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shifts into ubfiz: (x<<lc)>>rc && lc>rc
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func ubfiz1(x uint64) uint64 {
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// arm64:"UBFIZ\t[$]1, R[0-9]+, [$]60",-"LSL",-"LSR"
// s390x:"RISBGZ\t[$]3, [$]62, [$]1, ",-"SLD",-"SRD"
return (x << 4) >> 3
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shift and zero-extension into ubfiz.
func ubfiz2(x uint32) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return uint64(x+1) << 3 // arm64:"UBFIZ\t[$]3, R[0-9]+, [$]32",-"LSL"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfiz3(x uint16) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return uint64(x+1) << 3 // arm64:"UBFIZ\t[$]3, R[0-9]+, [$]16",-"LSL"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfiz4(x uint8) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return uint64(x+1) << 3 // arm64:"UBFIZ\t[$]3, R[0-9]+, [$]8",-"LSL"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfiz5(x uint8) uint64 {
return uint64(x) << 60 // arm64:"UBFIZ\t[$]60, R[0-9]+, [$]4",-"LSL"
}
func ubfiz6(x uint32) uint64 {
return uint64(x << 30) // arm64:"UBFIZ\t[$]30, R[0-9]+, [$]2",
}
func ubfiz7(x uint16) uint64 {
return uint64(x << 10) // arm64:"UBFIZ\t[$]10, R[0-9]+, [$]6",
}
func ubfiz8(x uint8) uint64 {
return uint64(x << 7) // arm64:"UBFIZ\t[$]7, R[0-9]+, [$]1",
}
// merge ANDconst into ubfiz.
func ubfiz9(x uint64) uint64 {
// arm64:"UBFIZ\t[$]3, R[0-9]+, [$]12",-"LSL",-"AND"
// s390x:"RISBGZ\t[$]49, [$]60, [$]3,",-"SLD",-"AND"
return (x & 0xfff) << 3
}
func ubfiz10(x uint64) uint64 {
// arm64:"UBFIZ\t[$]4, R[0-9]+, [$]12",-"LSL",-"AND"
// s390x:"RISBGZ\t[$]48, [$]59, [$]4,",-"SLD",-"AND"
return (x << 4) & 0xfff0
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// ubfiz combinations
func ubfiz11(x uint32) uint32 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFIZ\t[$]1, R[0-9]+, [$]28",-"LSL",-"LSR"
return (x << 4) >> 3
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfiz12(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFIZ\t[$]1, R[0-9]+, [$]20",-"LSL",-"LSR"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// s390x:"RISBGZ\t[$]43, [$]62, [$]1, ",-"SLD",-"SRD",-"AND"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return ((x & 0xfffff) << 4) >> 3
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfiz13(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFIZ\t[$]5, R[0-9]+, [$]13",-"LSL",-"LSR",-"AND"
return ((x << 3) & 0xffff) << 2
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfiz14(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFIZ\t[$]7, R[0-9]+, [$]12",-"LSL",-"LSR",-"AND"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// s390x:"RISBGZ\t[$]45, [$]56, [$]7, ",-"SLD",-"SRD",-"AND"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return ((x << 5) & (0xfff << 5)) << 2
}
// ubfx
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shifts into ubfx: (x<<lc)>>rc && lc<rc
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
func ubfx1(x uint64) uint64 {
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// arm64:"UBFX\t[$]1, R[0-9]+, [$]62",-"LSL",-"LSR"
// s390x:"RISBGZ\t[$]2, [$]63, [$]63,",-"SLD",-"SRD"
return (x << 1) >> 2
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
// merge shift and zero-extension into ubfx.
func ubfx2(x uint32) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return uint64(x >> 15) // arm64:"UBFX\t[$]15, R[0-9]+, [$]17",-"LSR"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx3(x uint16) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return uint64(x >> 9) // arm64:"UBFX\t[$]9, R[0-9]+, [$]7",-"LSR"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx4(x uint8) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return uint64(x >> 3) // arm64:"UBFX\t[$]3, R[0-9]+, [$]5",-"LSR"
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx5(x uint32) uint64 {
return uint64(x) >> 30 // arm64:"UBFX\t[$]30, R[0-9]+, [$]2"
}
func ubfx6(x uint16) uint64 {
return uint64(x) >> 10 // arm64:"UBFX\t[$]10, R[0-9]+, [$]6"
}
func ubfx7(x uint8) uint64 {
return uint64(x) >> 3 // arm64:"UBFX\t[$]3, R[0-9]+, [$]5"
}
// merge ANDconst into ubfx.
func ubfx8(x uint64) uint64 {
// arm64:"UBFX\t[$]25, R[0-9]+, [$]10",-"LSR",-"AND"
// s390x:"RISBGZ\t[$]54, [$]63, [$]39, ",-"SRD",-"AND"
return (x >> 25) & 1023
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx9(x uint64) uint64 {
// arm64:"UBFX\t[$]4, R[0-9]+, [$]8",-"LSR",-"AND"
// s390x:"RISBGZ\t[$]56, [$]63, [$]60, ",-"SRD",-"AND"
return (x & 0x0ff0) >> 4
}
// ubfx combinations.
func ubfx10(x uint32) uint32 {
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// arm64:"UBFX\t[$]1, R[0-9]+, [$]30",-"LSL",-"LSR"
return (x << 1) >> 2
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx11(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFX\t[$]1, R[0-9]+, [$]12",-"LSL",-"LSR",-"AND"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// s390x:"RISBGZ\t[$]52, [$]63, [$]63,",-"SLD",-"SRD",-"AND"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return ((x << 1) >> 2) & 0xfff
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx12(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFX\t[$]4, R[0-9]+, [$]11",-"LSL",-"LSR",-"AND"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// s390x:"RISBGZ\t[$]53, [$]63, [$]60, ",-"SLD",-"SRD",-"AND"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return ((x >> 3) & 0xfff) >> 1
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx13(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFX\t[$]5, R[0-9]+, [$]56",-"LSL",-"LSR"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// s390x:"RISBGZ\t[$]8, [$]63, [$]59, ",-"SLD",-"SRD"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return ((x >> 2) << 5) >> 8
}
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx14(x uint64) uint64 {
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
// arm64:"UBFX\t[$]1, R[0-9]+, [$]19",-"LSL",-"LSR"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 10:44:48 -06:00
// s390x:"RISBGZ\t[$]45, [$]63, [$]63, ",-"SLD",-"SRD",-"AND"
cmd/compile/internal/ssa: add patterns for arm64 bitfield opcodes Add patterns to match common idioms for EXTR, BFI, BFXIL, SBFIZ, SBFX, UBFIZ and UBFX opcodes. go1 benchmarks results on Amberwing: name old time/op new time/op delta FmtManyArgs 786ns ± 2% 714ns ± 1% -9.20% (p=0.000 n=10+10) Gzip 437ms ± 0% 402ms ± 0% -7.99% (p=0.000 n=10+10) FmtFprintfIntInt 196ns ± 0% 182ns ± 0% -7.28% (p=0.000 n=10+9) FmtFprintfPrefixedInt 207ns ± 0% 199ns ± 0% -3.86% (p=0.000 n=10+10) FmtFprintfFloat 324ns ± 0% 316ns ± 0% -2.47% (p=0.000 n=10+8) FmtFprintfInt 119ns ± 0% 117ns ± 0% -1.68% (p=0.000 n=10+9) GobDecode 12.8ms ± 2% 12.6ms ± 1% -1.62% (p=0.002 n=10+10) JSONDecode 94.4ms ± 1% 93.4ms ± 0% -1.10% (p=0.000 n=10+10) RegexpMatchEasy0_32 247ns ± 0% 245ns ± 0% -0.65% (p=0.000 n=10+10) RegexpMatchMedium_32 314ns ± 0% 312ns ± 0% -0.64% (p=0.000 n=10+10) RegexpMatchEasy0_1K 541ns ± 0% 538ns ± 0% -0.55% (p=0.000 n=10+9) TimeParse 450ns ± 1% 448ns ± 1% -0.42% (p=0.035 n=9+9) RegexpMatchEasy1_32 244ns ± 0% 243ns ± 0% -0.41% (p=0.000 n=10+10) GoParse 6.03ms ± 0% 6.00ms ± 0% -0.40% (p=0.002 n=10+10) RegexpMatchEasy1_1K 779ns ± 0% 777ns ± 0% -0.26% (p=0.000 n=10+10) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 1% -0.06% (p=0.026 n=9+9) BinaryTree17 11.7s ± 0% 11.6s ± 0% ~ (p=0.089 n=10+10) HTTPClientServer 89.1µs ± 1% 89.5µs ± 2% ~ (p=0.436 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 79.5µs ± 2% ~ (p=0.469 n=10+10) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) GobEncode 12.0ms ± 1% 12.1ms ± 0% ~ (p=0.075 n=10+10) Revcomp 669ms ± 0% 668ms ± 0% ~ (p=0.091 n=7+9) Mandelbrot200 5.35ms ± 0% 5.36ms ± 0% +0.07% (p=0.000 n=9+9) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% +0.10% (p=0.000 n=9+9) Fannkuch11 3.25s ± 0% 3.26s ± 0% +0.36% (p=0.000 n=9+10) FmtFprintfString 114ns ± 1% 115ns ± 0% +0.52% (p=0.011 n=10+10) JSONEncode 20.2ms ± 0% 20.3ms ± 0% +0.65% (p=0.000 n=10+10) Template 91.3ms ± 0% 92.3ms ± 0% +1.08% (p=0.000 n=10+10) TimeFormat 484ns ± 0% 495ns ± 1% +2.30% (p=0.000 n=9+10) There are some opportunities to improve this change further by adding patterns to match the "extended register" versions of ADD/SUB/CMP, but I think that should be evaluated on its own. The regressions in Template and TimeFormat would likely be recovered by this, as they seem to be due to generating: ubfiz x0, x0, #3, #8 add x1, x2, x0 instead of add x1, x2, x0, lsl #3 Change-Id: I5644a8d70ac7a98e784a377a2b76ab47a3415a4b Reviewed-on: https://go-review.googlesource.com/88355 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-21 14:15:39 -07:00
return ((x & 0xfffff) << 3) >> 4
}
// merge ubfx and zero-extension into ubfx.
cmd/compile: simiplify arm64 bitfield optimizations In some rewrite rules for arm64 bitfield optimizations, the bitfield lsb value and the bitfield width value are related to datasize, some of them use datasize directly to check the bitfield lsb value is valid, to get the bitfiled width value, but some of them call isARM64BFMask() and arm64BFWidth() functions. In order to be consistent, this patch changes them all to use datasize. Besides, this patch sorts the codegen test cases. Run the "toolstash-check -all" command and find one inconsistent code is as the following. new: src/math/fma.go:104 BEQ 247 master: src/math/fma.go:104 BEQ 248 The above inconsistence is due to this patch changing the range of the field lsb value in "UBFIZ" optimization rules from "lc+(32|16|8)<64" to "lc<64", so that the following code is generated as "UBFIZ". The logical of changed code is still correct. The code of src/math/fma.go:160: const uvinf = 0x7FF0000000000000 func FMA(a, b uint32) float64 { ps := a+b return Float64frombits(uint64(ps)<<63 | uvinf) } The new assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 UBFIZ $63, R0, $1, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) The master assembly code: TEXT "".FMA(SB), LEAF|NOFRAME|ABIInternal, $0-16 MOVWU "".a(FP), R0 MOVWU "".b+4(FP), R1 ADD R1, R0, R0 MOVWU R0, R0 LSL $63, R0, R0 ORR $9218868437227405312, R0, R0 MOVD R0, "".~r2+8(FP) RET (R30) Change-Id: I9061104adfdfd3384d0525327ae1e5c8b0df5c35 Reviewed-on: https://go-review.googlesource.com/c/go/+/265038 Trust: fannie zhang <Fannie.Zhang@arm.com> Run-TryBot: fannie zhang <Fannie.Zhang@arm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2020-10-21 04:51:42 -06:00
func ubfx15(x uint64) bool {
midr := x + 10
part_num := uint16((midr >> 4) & 0xfff)
if part_num == 0xd0c { // arm64:"UBFX\t[$]4, R[0-9]+, [$]12",-"MOVHU\tR[0-9]+, R[0-9]+"
return true
}
return false
}
// merge ANDconst and ubfx into ubfx
func ubfx16(x uint64) uint64 {
// arm64:"UBFX\t[$]4, R[0-9]+, [$]6",-"AND\t[$]63"
return ((x >> 3) & 0xfff) >> 1 & 0x3f
}
// Check that we don't emit comparisons for constant shifts.
//
//go:nosplit
func shift_no_cmp(x int) int {
// arm64:`LSL\t[$]17`,-`CMP`
// mips64:`SLLV\t[$]17`,-`SGT`
return x << 17
}
func rev16(c uint64) (uint64, uint64, uint64) {
// arm64:`REV16`,-`AND`,-`LSR`,-`AND`,-`ORR\tR[0-9]+<<8`
b1 := ((c & 0xff00ff00ff00ff00) >> 8) | ((c & 0x00ff00ff00ff00ff) << 8)
// arm64:-`ADD\tR[0-9]+<<8`
b2 := ((c & 0xff00ff00ff00ff00) >> 8) + ((c & 0x00ff00ff00ff00ff) << 8)
// arm64:-`EOR\tR[0-9]+<<8`
b3 := ((c & 0xff00ff00ff00ff00) >> 8) ^ ((c & 0x00ff00ff00ff00ff) << 8)
return b1, b2, b3
}
func rev16w(c uint32) (uint32, uint32, uint32) {
// arm64:`REV16W`,-`AND`,-`UBFX`,-`AND`,-`ORR\tR[0-9]+<<8`
b1 := ((c & 0xff00ff00) >> 8) | ((c & 0x00ff00ff) << 8)
// arm64:-`ADD\tR[0-9]+<<8`
b2 := ((c & 0xff00ff00) >> 8) + ((c & 0x00ff00ff) << 8)
// arm64:-`EOR\tR[0-9]+<<8`
b3 := ((c & 0xff00ff00) >> 8) ^ ((c & 0x00ff00ff) << 8)
return b1, b2, b3
}
func shift(x uint32, y uint16, z uint8) uint64 {
// arm64:-`MOVWU`,-`LSR\t[$]32`
a := uint64(x) >> 32
// arm64:-`MOVHU
b := uint64(y) >> 16
// arm64:-`MOVBU`
c := uint64(z) >> 8
// arm64:`MOVD\tZR`,-`ADD\tR[0-9]+>>16`,-`ADD\tR[0-9]+>>8`,
return a + b + c
}