2018-03-02 07:16:27 -07:00
|
|
|
// asmcheck
|
|
|
|
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package codegen
|
|
|
|
|
|
|
|
import "math/bits"
|
|
|
|
|
2018-03-05 11:46:18 -07:00
|
|
|
// ----------------------- //
|
|
|
|
// bits.LeadingZeros //
|
|
|
|
// ----------------------- //
|
|
|
|
|
|
|
|
// LeadingZeros returns bits.LeadingZeros(n) for a uint argument.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func LeadingZeros(n uint) int {
	// amd64/v1,amd64/v2:"BSRQ"
	// amd64/v3:"LZCNTQ", -"BSRQ"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"CNTLZD"
	// ppc64:"CNTLZD"
	return bits.LeadingZeros(n)
}
|
|
|
|
|
|
|
|
// LeadingZeros64 returns bits.LeadingZeros64(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func LeadingZeros64(n uint64) int {
	// amd64/v1,amd64/v2:"BSRQ"
	// amd64/v3:"LZCNTQ", -"BSRQ"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"CNTLZD"
	// ppc64:"CNTLZD"
	return bits.LeadingZeros64(n)
}
|
|
|
|
|
|
|
|
// LeadingZeros32 returns bits.LeadingZeros32(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func LeadingZeros32(n uint32) int {
	// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
	// amd64/v3: "LZCNTL",- "BSRL"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZW"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"CNTLZW"
	// ppc64:"CNTLZW"
	return bits.LeadingZeros32(n)
}
|
|
|
|
|
|
|
|
// LeadingZeros16 returns bits.LeadingZeros16(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func LeadingZeros16(n uint16) int {
	// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
	// amd64/v3: "LZCNTL",- "BSRL"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"CNTLZD"
	// ppc64:"CNTLZD"
	return bits.LeadingZeros16(n)
}
|
|
|
|
|
|
|
|
// LeadingZeros8 returns bits.LeadingZeros8(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func LeadingZeros8(n uint8) int {
	// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
	// amd64/v3: "LZCNTL",- "BSRL"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"CNTLZD"
	// ppc64:"CNTLZD"
	return bits.LeadingZeros8(n)
}
|
|
|
|
|
2018-03-02 07:16:27 -07:00
|
|
|
// --------------- //
|
|
|
|
// bits.Len* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Len returns bits.Len(n) for a uint argument.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func Len(n uint) int {
	// amd64/v1,amd64/v2:"BSRQ"
	// amd64/v3: "LZCNTQ"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"SUBC","CNTLZD"
	// ppc64:"SUBC","CNTLZD"
	return bits.Len(n)
}
|
|
|
|
|
|
|
|
// Len64 returns bits.Len64(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func Len64(n uint64) int {
	// amd64/v1,amd64/v2:"BSRQ"
	// amd64/v3: "LZCNTQ"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"SUBC","CNTLZD"
	// ppc64:"SUBC","CNTLZD"
	return bits.Len64(n)
}
|
|
|
|
|
2020-08-20 14:06:23 -06:00
|
|
|
// SubFromLen64 returns 64 minus bits.Len64(n).
//
// The ppc64 and ppc64le comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func SubFromLen64(n uint64) int {
	// ppc64le:"CNTLZD",-"SUBC"
	// ppc64:"CNTLZD",-"SUBC"
	return 64 - bits.Len64(n)
}
|
|
|
|
|
2018-03-02 07:16:27 -07:00
|
|
|
// Len32 returns bits.Len32(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func Len32(n uint32) int {
	// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
	// amd64/v3: "LZCNTL"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64: "CNTLZW"
	// ppc64le: "CNTLZW"
	return bits.Len32(n)
}
|
|
|
|
|
|
|
|
// Len16 returns bits.Len16(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func Len16(n uint16) int {
	// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
	// amd64/v3: "LZCNTL"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"SUBC","CNTLZD"
	// ppc64:"SUBC","CNTLZD"
	return bits.Len16(n)
}
|
|
|
|
|
|
|
|
// Len8 returns bits.Len8(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func Len8(n uint8) int {
	// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
	// amd64/v3: "LZCNTL"
	// s390x:"FLOGR"
	// arm:"CLZ" arm64:"CLZ"
	// mips:"CLZ"
	// wasm:"I64Clz"
	// ppc64le:"SUBC","CNTLZD"
	// ppc64:"SUBC","CNTLZD"
	return bits.Len8(n)
}
|
2018-03-06 01:39:14 -07:00
|
|
|
|
2018-03-06 04:55:41 -07:00
|
|
|
// -------------------- //
|
|
|
|
// bits.OnesCount //
|
|
|
|
// -------------------- //
|
|
|
|
|
2021-09-15 01:31:05 -06:00
|
|
|
// TODO(register args) Restore a m d 6 4 / v 1 :.*x86HasPOPCNT when only one ABI is tested.
|
2018-03-06 04:55:41 -07:00
|
|
|
// OnesCount returns bits.OnesCount(n) for a uint argument.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func OnesCount(n uint) int {
	// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
	// amd64:"POPCNTQ"
	// arm64:"VCNT","VUADDLV"
	// s390x:"POPCNT"
	// ppc64:"POPCNTD"
	// ppc64le:"POPCNTD"
	// wasm:"I64Popcnt"
	return bits.OnesCount(n)
}
|
|
|
|
|
|
|
|
// OnesCount64 returns bits.OnesCount64(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func OnesCount64(n uint64) int {
	// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
	// amd64:"POPCNTQ"
	// arm64:"VCNT","VUADDLV"
	// s390x:"POPCNT"
	// ppc64:"POPCNTD"
	// ppc64le:"POPCNTD"
	// wasm:"I64Popcnt"
	return bits.OnesCount64(n)
}
|
|
|
|
|
|
|
|
// OnesCount32 returns bits.OnesCount32(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func OnesCount32(n uint32) int {
	// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
	// amd64:"POPCNTL"
	// arm64:"VCNT","VUADDLV"
	// s390x:"POPCNT"
	// ppc64:"POPCNTW"
	// ppc64le:"POPCNTW"
	// wasm:"I64Popcnt"
	return bits.OnesCount32(n)
}
|
|
|
|
|
|
|
|
// OnesCount16 returns bits.OnesCount16(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func OnesCount16(n uint16) int {
	// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
	// amd64:"POPCNTL"
	// arm64:"VCNT","VUADDLV"
	// s390x:"POPCNT"
	// ppc64:"POPCNTW"
	// ppc64le:"POPCNTW"
	// wasm:"I64Popcnt"
	return bits.OnesCount16(n)
}
|
|
|
|
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
// OnesCount8 returns bits.OnesCount8(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func OnesCount8(n uint8) int {
	// s390x:"POPCNT"
	// ppc64:"POPCNTB"
	// ppc64le:"POPCNTB"
	// wasm:"I64Popcnt"
	return bits.OnesCount8(n)
}
|
|
|
|
|
2018-03-06 12:10:35 -07:00
|
|
|
// ----------------------- //
|
|
|
|
// bits.ReverseBytes //
|
|
|
|
// ----------------------- //
|
|
|
|
|
|
|
|
// ReverseBytes returns bits.ReverseBytes(n) for a uint argument.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func ReverseBytes(n uint) uint {
	// amd64:"BSWAPQ"
	// s390x:"MOVDBR"
	// arm64:"REV"
	return bits.ReverseBytes(n)
}
|
|
|
|
|
|
|
|
// ReverseBytes64 returns bits.ReverseBytes64(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func ReverseBytes64(n uint64) uint64 {
	// amd64:"BSWAPQ"
	// s390x:"MOVDBR"
	// arm64:"REV"
	return bits.ReverseBytes64(n)
}
|
|
|
|
|
|
|
|
// ReverseBytes32 returns bits.ReverseBytes32(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func ReverseBytes32(n uint32) uint32 {
	// amd64:"BSWAPL"
	// s390x:"MOVWBR"
	// arm64:"REVW"
	return bits.ReverseBytes32(n)
}
|
|
|
|
|
|
|
|
// ReverseBytes16 returns bits.ReverseBytes16(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func ReverseBytes16(n uint16) uint16 {
	// amd64:"ROLW"
	// arm64:"REV16W",-"UBFX",-"ORR"
	// arm/5:"SLL","SRL","ORR"
	// arm/6:"REV16"
	// arm/7:"REV16"
	return bits.ReverseBytes16(n)
}
|
|
|
|
|
2018-03-08 09:43:55 -07:00
|
|
|
// --------------------- //
|
|
|
|
// bits.RotateLeft //
|
|
|
|
// --------------------- //
|
|
|
|
|
|
|
|
// RotateLeft64 rotates n left by the constant 37 using bits.RotateLeft64.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func RotateLeft64(n uint64) uint64 {
	// amd64:"ROLQ"
	// arm64:"ROR"
	// ppc64:"ROTL"
	// ppc64le:"ROTL"
	// s390x:"RISBGZ\t[$]0, [$]63, [$]37, "
	// wasm:"I64Rotl"
	return bits.RotateLeft64(n, 37)
}
|
|
|
|
|
|
|
|
// RotateLeft32 rotates n left by the constant 9 using bits.RotateLeft32.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func RotateLeft32(n uint32) uint32 {
	// amd64:"ROLL" 386:"ROLL"
	// arm:`MOVW\tR[0-9]+@>23`
	// arm64:"RORW"
	// ppc64:"ROTLW"
	// ppc64le:"ROTLW"
	// s390x:"RLL"
	// wasm:"I32Rotl"
	return bits.RotateLeft32(n, 9)
}
|
|
|
|
|
|
|
|
// RotateLeft16 rotates n left by the constant 5 using bits.RotateLeft16.
//
// The comment above the return is an asmcheck directive; it must
// immediately precede that statement and stay unchanged.
func RotateLeft16(n uint16) uint16 {
	// amd64:"ROLW" 386:"ROLW"
	return bits.RotateLeft16(n, 5)
}
|
|
|
|
|
|
|
|
// RotateLeft8 rotates n left by the constant 5 using bits.RotateLeft8.
//
// The comment above the return is an asmcheck directive; it must
// immediately precede that statement and stay unchanged.
func RotateLeft8(n uint8) uint8 {
	// amd64:"ROLB" 386:"ROLB"
	return bits.RotateLeft8(n, 5)
}
|
|
|
|
|
2018-09-03 08:47:58 -06:00
|
|
|
// RotateLeftVariable rotates the uint n left by the run-time amount m
// using bits.RotateLeft.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func RotateLeftVariable(n uint, m int) uint {
	// amd64:"ROLQ"
	// arm64:"ROR"
	// ppc64:"ROTL"
	// ppc64le:"ROTL"
	// s390x:"RLLG"
	// wasm:"I64Rotl"
	return bits.RotateLeft(n, m)
}
|
|
|
|
|
|
|
|
// RotateLeftVariable64 rotates n left by the run-time amount m using
// bits.RotateLeft64.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func RotateLeftVariable64(n uint64, m int) uint64 {
	// amd64:"ROLQ"
	// arm64:"ROR"
	// ppc64:"ROTL"
	// ppc64le:"ROTL"
	// s390x:"RLLG"
	// wasm:"I64Rotl"
	return bits.RotateLeft64(n, m)
}
|
|
|
|
|
|
|
|
// RotateLeftVariable32 rotates n left by the run-time amount m using
// bits.RotateLeft32.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func RotateLeftVariable32(n uint32, m int) uint32 {
	// arm:`MOVW\tR[0-9]+@>R[0-9]+`
	// amd64:"ROLL"
	// arm64:"RORW"
	// ppc64:"ROTLW"
	// ppc64le:"ROTLW"
	// s390x:"RLL"
	// wasm:"I32Rotl"
	return bits.RotateLeft32(n, m)
}
|
|
|
|
|
2018-03-06 01:39:14 -07:00
|
|
|
// ------------------------ //
|
|
|
|
// bits.TrailingZeros //
|
|
|
|
// ------------------------ //
|
|
|
|
|
|
|
|
// TrailingZeros returns bits.TrailingZeros(n) for a uint argument.
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func TrailingZeros(n uint) int {
	// amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
	// amd64/v3:"TZCNTQ"
	// arm:"CLZ"
	// arm64:"RBIT","CLZ"
	// s390x:"FLOGR"
	// ppc64/power8:"ANDN","POPCNTD"
	// ppc64le/power8:"ANDN","POPCNTD"
	// ppc64/power9: "CNTTZD"
	// ppc64le/power9: "CNTTZD"
	// wasm:"I64Ctz"
	return bits.TrailingZeros(n)
}
|
|
|
|
|
|
|
|
// TrailingZeros64 returns bits.TrailingZeros64(n).
//
// The per-architecture comments above the return are asmcheck directives;
// they must immediately precede that statement and stay unchanged.
func TrailingZeros64(n uint64) int {
	// amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
	// amd64/v3:"TZCNTQ"
	// arm64:"RBIT","CLZ"
	// s390x:"FLOGR"
	// ppc64/power8:"ANDN","POPCNTD"
	// ppc64le/power8:"ANDN","POPCNTD"
	// ppc64/power9: "CNTTZD"
	// ppc64le/power9: "CNTTZD"
	// wasm:"I64Ctz"
	return bits.TrailingZeros64(n)
}
|
|
|
|
|
2020-08-20 14:06:23 -06:00
|
|
|
// TrailingZeros64Subtract returns bits.TrailingZeros64 of 1 - n, where the
// subtraction wraps around in uint64 arithmetic.
//
// The ppc64le comments above the return are asmcheck directives; they must
// immediately precede that statement and stay unchanged.
func TrailingZeros64Subtract(n uint64) int {
	// ppc64le/power8:"NEG","SUBC","ANDN","POPCNTD"
	// ppc64le/power9:"SUBC","CNTTZD"
	return bits.TrailingZeros64(1 - n)
}
|
|
|
|
|
2018-03-06 01:39:14 -07:00
|
|
|
func TrailingZeros32(n uint32) int {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BTSQ\\t\\$32","BSFQ"
|
|
|
|
// amd64/v3:"TZCNTL"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"CLZ"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBITW","CLZW"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR","MOVWZ"
|
2019-02-08 11:18:12 -07:00
|
|
|
// ppc64/power8:"ANDN","POPCNTW"
|
|
|
|
// ppc64le/power8:"ANDN","POPCNTW"
|
|
|
|
// ppc64/power9: "CNTTZW"
|
|
|
|
// ppc64le/power9: "CNTTZW"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TrailingZeros16(n uint16) int {
|
cmd/compile: optimize TrailingZeros(8|16) on amd64
Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
name old time/op new time/op delta
TrailingZeros8-8 1.33ns ± 6% 0.84ns ± 3% -36.90% (p=0.000 n=20+20)
TrailingZeros16-8 1.26ns ± 5% 0.84ns ± 5% -33.50% (p=0.000 n=20+18)
Code:
func f8(x uint8) { z = bits.TrailingZeros8(x) }
func f16(x uint16) { z = bits.TrailingZeros16(x) }
Before:
"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX
0x0005 00005 (x.go:7) MOVBLZX AL, AX
0x0008 00008 (x.go:7) BTSQ $8, AX
0x000d 00013 (x.go:7) BSFQ AX, AX
0x0011 00017 (x.go:7) MOVL $64, CX
0x0016 00022 (x.go:7) CMOVQEQ CX, AX
0x001a 00026 (x.go:7) MOVQ AX, "".z(SB)
0x0021 00033 (x.go:7) RET
"".f16 STEXT nosplit size=34 args=0x8 locals=0x0
0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX
0x0005 00005 (x.go:8) MOVWLZX AX, AX
0x0008 00008 (x.go:8) BTSQ $16, AX
0x000d 00013 (x.go:8) BSFQ AX, AX
0x0011 00017 (x.go:8) MOVL $64, CX
0x0016 00022 (x.go:8) CMOVQEQ CX, AX
0x001a 00026 (x.go:8) MOVQ AX, "".z(SB)
0x0021 00033 (x.go:8) RET
After:
"".f8 STEXT nosplit size=20 args=0x8 locals=0x0
0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX
0x0005 00005 (x.go:7) BTSL $8, AX
0x0009 00009 (x.go:7) BSFL AX, AX
0x000c 00012 (x.go:7) MOVQ AX, "".z(SB)
0x0013 00019 (x.go:7) RET
"".f16 STEXT nosplit size=20 args=0x8 locals=0x0
0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX
0x0005 00005 (x.go:8) BTSL $16, AX
0x0009 00009 (x.go:8) BSFL AX, AX
0x000c 00012 (x.go:8) MOVQ AX, "".z(SB)
0x0013 00019 (x.go:8) RET
Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11
Reviewed-on: https://go-review.googlesource.com/108940
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:46:41 -06:00
|
|
|
// amd64:"BSFL","BTSL\\t\\$16"
|
2019-08-30 00:24:58 -06:00
|
|
|
// 386:"BSFL\t"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR","OR\t\\$65536"
|
2019-02-08 11:18:12 -07:00
|
|
|
// ppc64/power8:"POPCNTD","OR\\t\\$65536"
|
|
|
|
// ppc64le/power8:"POPCNTD","OR\\t\\$65536"
|
|
|
|
// ppc64/power9:"CNTTZD","OR\\t\\$65536"
|
|
|
|
// ppc64le/power9:"CNTTZD","OR\\t\\$65536"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros16(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TrailingZeros8(n uint8) int {
|
cmd/compile: optimize TrailingZeros(8|16) on amd64
Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
name old time/op new time/op delta
TrailingZeros8-8 1.33ns ± 6% 0.84ns ± 3% -36.90% (p=0.000 n=20+20)
TrailingZeros16-8 1.26ns ± 5% 0.84ns ± 5% -33.50% (p=0.000 n=20+18)
Code:
func f8(x uint8) { z = bits.TrailingZeros8(x) }
func f16(x uint16) { z = bits.TrailingZeros16(x) }
Before:
"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX
0x0005 00005 (x.go:7) MOVBLZX AL, AX
0x0008 00008 (x.go:7) BTSQ $8, AX
0x000d 00013 (x.go:7) BSFQ AX, AX
0x0011 00017 (x.go:7) MOVL $64, CX
0x0016 00022 (x.go:7) CMOVQEQ CX, AX
0x001a 00026 (x.go:7) MOVQ AX, "".z(SB)
0x0021 00033 (x.go:7) RET
"".f16 STEXT nosplit size=34 args=0x8 locals=0x0
0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX
0x0005 00005 (x.go:8) MOVWLZX AX, AX
0x0008 00008 (x.go:8) BTSQ $16, AX
0x000d 00013 (x.go:8) BSFQ AX, AX
0x0011 00017 (x.go:8) MOVL $64, CX
0x0016 00022 (x.go:8) CMOVQEQ CX, AX
0x001a 00026 (x.go:8) MOVQ AX, "".z(SB)
0x0021 00033 (x.go:8) RET
After:
"".f8 STEXT nosplit size=20 args=0x8 locals=0x0
0x0000 00000 (x.go:7) TEXT "".f8(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:7) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:7) MOVBLZX "".x+8(SP), AX
0x0005 00005 (x.go:7) BTSL $8, AX
0x0009 00009 (x.go:7) BSFL AX, AX
0x000c 00012 (x.go:7) MOVQ AX, "".z(SB)
0x0013 00019 (x.go:7) RET
"".f16 STEXT nosplit size=20 args=0x8 locals=0x0
0x0000 00000 (x.go:8) TEXT "".f16(SB), NOSPLIT, $0-8
0x0000 00000 (x.go:8) FUNCDATA $0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
0x0000 00000 (x.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (x.go:8) MOVWLZX "".x+8(SP), AX
0x0005 00005 (x.go:8) BTSL $16, AX
0x0009 00009 (x.go:8) BSFL AX, AX
0x000c 00012 (x.go:8) MOVQ AX, "".z(SB)
0x0013 00019 (x.go:8) RET
Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11
Reviewed-on: https://go-review.googlesource.com/108940
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
2018-04-23 15:46:41 -06:00
|
|
|
// amd64:"BSFL","BTSL\\t\\$8"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"ORR\t\\$256","CLZ",-"MOVBU\tR"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR","OR\t\\$256"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros8(n)
|
|
|
|
}
|
2018-04-25 12:52:06 -06:00
|
|
|
|
|
|
|
// IterateBitsNN checks special handling of TrailingZerosNN when the input is known to be non-zero.
|
|
|
|
|
|
|
|
// IterateBits sums bits.TrailingZeros(n) over successive values of n, clearing
// the lowest set bit (n &= n - 1) after each call until n reaches zero. The
// loop guard means n is non-zero at every TrailingZeros call. The amd64
// directives apply to that call; a pattern with a leading '-' must be absent.
func IterateBits(n uint) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
|
|
|
|
// amd64/v3:"TZCNTQ"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
// IterateBits64 is the uint64 form of the bit-iteration loop: it accumulates
// bits.TrailingZeros64(n) and clears the lowest set bit of n each pass. The
// loop condition guarantees n != 0 at the call the amd64 directives describe.
func IterateBits64(n uint64) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
|
|
|
|
// amd64/v3:"TZCNTQ"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros64(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
// IterateBits32 accumulates bits.TrailingZeros32(n) while clearing the lowest
// set bit of n each pass; n is non-zero at every call because of the loop
// guard. For amd64 v1/v2 the directives expect BSFL and exclude BTSQ.
func IterateBits32(n uint32) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFL",-"BTSQ"
|
|
|
|
// amd64/v3:"TZCNTL"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros32(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
func IterateBits16(n uint16) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFL",-"BTSL"
|
|
|
|
// amd64/v3:"TZCNTL"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBITW","CLZW",-"ORR"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros16(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
func IterateBits8(n uint8) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFL",-"BTSL"
|
|
|
|
// amd64/v3:"TZCNTL"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBITW","CLZW",-"ORR"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros8(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
2018-08-14 16:41:22 -06:00
|
|
|
|
2018-10-23 15:05:38 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Add* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Add forwards to bits.Add with a variable carry-in ci and returns both the
// sum r and the carry-out co. The arch-prefixed comments are asmcheck
// directives for the return statement; quoted opcode patterns are expected in
// the generated code and patterns with a leading '-' must not appear.
func Add(x, y, ci uint) (r, co uint) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: "ADDC", "ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddC calls bits.Add with the constant 7 as the second operand and a variable
// carry-in ci, returning the sum and the carry-out. The asmcheck directives
// list the same opcode patterns as for the all-variable Add case.
func AddC(x, ci uint) (r, co uint) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: "ADDC", "ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddZ calls bits.Add with a constant zero carry-in and returns the sum and
// the carry-out. The asmcheck directives exclude the carry-consuming forms
// listed for Add (ADCS on arm64, ADCQ on amd64, ADDE on ppc64/ppc64le).
func AddZ(x, y uint) (r, co uint) {
|
2019-03-20 21:24:47 -06:00
|
|
|
// arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: "ADDC", -"ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC", -"ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDC",-"ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddR returns only the sum from bits.Add(x, y, ci); the carry-out is
// discarded. The asmcheck directives apply to the bits.Add line and exclude
// SBBQ and NEGQ on amd64 and ADDZE on ppc64/ppc64le, which the carry-returning
// Add case expects.
func AddR(x, y, ci uint) uint {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: "ADDC", "ADDE", -"ADDZE"
|
|
|
|
// ppc64le: "ADDC", "ADDE", -"ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
r, _ := bits.Add(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
2019-04-30 10:46:23 -06:00
|
|
|
|
2018-10-23 15:05:38 -06:00
|
|
|
// AddM adds the three-element arrays p and q element by element into r,
// threading the carry c through three chained bits.Add calls (c starts at its
// zero value). The asmcheck directives sit before the second call, whose
// carry-in is the carry-out of the first.
func AddM(p, q, r *[3]uint) {
|
|
|
|
var c uint
|
|
|
|
r[0], c = bits.Add(p[0], q[0], c)
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE",-"ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
r[1], c = bits.Add(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Add(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64 forwards to bits.Add64 with a variable carry-in ci and returns both
// the sum r and the carry-out co. The arch-prefixed comments are asmcheck
// directives for the return statement; patterns with a leading '-' must not
// appear in the generated code.
func Add64(x, y, ci uint64) (r, co uint64) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2019-04-24 11:33:50 -06:00
|
|
|
// ppc64: "ADDC", "ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add64(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64C calls bits.Add64 with the constant 7 as the second operand and a
// variable carry-in ci, returning the sum and the carry-out. The asmcheck
// directives list the same opcode patterns as for the all-variable Add64 case.
func Add64C(x, ci uint64) (r, co uint64) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2019-04-24 11:33:50 -06:00
|
|
|
// ppc64: "ADDC", "ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add64(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64Z calls bits.Add64 with a constant zero carry-in and returns the sum
// and the carry-out. The asmcheck directives exclude the carry-consuming forms
// listed for Add64 (ADCS on arm64, ADCQ on amd64, ADDE on ppc64/ppc64le).
func Add64Z(x, y uint64) (r, co uint64) {
|
2019-03-20 21:24:47 -06:00
|
|
|
// arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: "ADDC", -"ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC", -"ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDC",-"ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add64(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64R returns only the sum from bits.Add64(x, y, ci); the carry-out is
// discarded. The asmcheck directives apply to the bits.Add64 line and exclude
// SBBQ and NEGQ on amd64 and ADDZE on ppc64/ppc64le, which the carry-returning
// Add64 case expects.
func Add64R(x, y, ci uint64) uint64 {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: "ADDC", "ADDE", -"ADDZE"
|
|
|
|
// ppc64le: "ADDC", "ADDE", -"ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
r, _ := bits.Add64(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
// Add64M adds the three-element arrays p and q element by element into r,
// threading the carry c through three chained bits.Add64 calls (c starts at
// its zero value). The asmcheck directives sit before the second call, whose
// carry-in is the carry-out of the first.
func Add64M(p, q, r *[3]uint64) {
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(p[0], q[0], c)
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64: -"ADDC", "ADDE", -"ADDZE"
|
|
|
|
// ppc64le: -"ADDC", "ADDE", -"ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE",-"ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
r[1], c = bits.Add64(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Add64(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
2021-06-08 12:16:01 -06:00
|
|
|
// Add64MSaveC adds the two-element arrays p and q into r and stores each
// step's carry-out in c[0] and c[1]. The second bits.Add64 reads c[0] back as
// its carry-in. Each pair of ppc64/ppc64le asmcheck directives applies to the
// bits.Add64 line that follows it.
func Add64MSaveC(p, q, r, c *[2]uint64) {
|
|
|
|
// ppc64: "ADDC\tR", "ADDZE"
|
|
|
|
// ppc64le: "ADDC\tR", "ADDZE"
|
|
|
|
r[0], c[0] = bits.Add64(p[0], q[0], 0)
|
|
|
|
// ppc64: "ADDC\t[$]-1", "ADDE", "ADDZE"
|
|
|
|
// ppc64le: "ADDC\t[$]-1", "ADDE", "ADDZE"
|
|
|
|
r[1], c[1] = bits.Add64(p[1], q[1], c[0])
|
|
|
|
}
|
|
|
|
|
2020-02-17 04:43:33 -07:00
|
|
|
// Add64PanicOnOverflowEQ returns a + b computed with bits.Add64 and panics
// with "overflow" when the carry-out c equals 1. The s390x asmcheck directive
// applies to the if statement: it expects a BRC $3 branch and no ADDE.
func Add64PanicOnOverflowEQ(a, b uint64) uint64 {
|
|
|
|
r, c := bits.Add64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]3,",-"ADDE"
|
|
|
|
if c == 1 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64PanicOnOverflowNE returns a + b computed with bits.Add64 and panics
// with "overflow" when the carry-out c is not 0. The s390x asmcheck directive
// applies to the if statement: it expects a BRC $3 branch and no ADDE.
func Add64PanicOnOverflowNE(a, b uint64) uint64 {
|
|
|
|
r, c := bits.Add64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]3,",-"ADDE"
|
|
|
|
if c != 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64PanicOnOverflowGT returns a + b computed with bits.Add64 and panics
// with "overflow" when the carry-out c is greater than 0. The s390x asmcheck
// directive applies to the if statement: it expects a BRC $3 branch and no
// ADDE.
func Add64PanicOnOverflowGT(a, b uint64) uint64 {
|
|
|
|
r, c := bits.Add64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]3,",-"ADDE"
|
|
|
|
if c > 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64MPanicOnOverflowEQ adds the two-element arrays a and b with two chained
// bits.Add64 calls and panics with "overflow" when the final carry c equals 1.
// The s390x asmcheck directive applies to the if statement and expects a
// BRC $3 branch.
func Add64MPanicOnOverflowEQ(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Add64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]3,"
|
|
|
|
if c == 1 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64MPanicOnOverflowNE adds the two-element arrays a and b with two chained
// bits.Add64 calls and panics with "overflow" when the final carry c is not 0.
// The s390x asmcheck directive applies to the if statement and expects a
// BRC $3 branch.
func Add64MPanicOnOverflowNE(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Add64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]3,"
|
|
|
|
if c != 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64MPanicOnOverflowGT adds the two-element arrays a and b with two chained
// bits.Add64 calls and panics with "overflow" when the final carry c is
// greater than 0. The s390x asmcheck directive applies to the if statement and
// expects a BRC $3 branch.
func Add64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Add64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]3,"
|
|
|
|
if c > 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2018-10-23 15:38:22 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Sub* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Sub forwards to bits.Sub with a variable borrow-in ci and returns both the
// difference r and the borrow-out co. The arch-prefixed comments are asmcheck
// directives for the return statement; quoted opcode patterns are expected in
// the generated code and patterns with a leading '-' must not appear.
func Sub(x, y, ci uint) (r, co uint) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
|
|
|
|
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// SubC calls bits.Sub with the constant 7 as the subtrahend and a variable
// borrow-in ci, returning the difference and the borrow-out. The asmcheck
// directives list the same opcode patterns as for the all-variable Sub case.
func SubC(x, ci uint) (r, co uint) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
|
|
|
|
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// SubZ calls bits.Sub with a constant zero borrow-in and returns the
// difference and the borrow-out. The asmcheck directives exclude the
// borrow-consuming forms listed for Sub (NEGL on amd64, SBCS on arm64, SUBE on
// ppc64/ppc64le).
func SubZ(x, y uint) (r, co uint) {
|
|
|
|
// amd64:"SUBQ","SBBQ","NEGQ",-"NEGL"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", -"SUBE", "SUBZE", "NEG"
|
|
|
|
// ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBC"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// SubR returns only the difference from bits.Sub(x, y, ci); the borrow-out is
// discarded. The asmcheck directives apply to the bits.Sub line and exclude
// NEGQ on amd64, NGC and NEG on arm64, and SUBZE and NEG on ppc64/ppc64le,
// which the borrow-returning Sub case expects.
func SubR(x, y, ci uint) uint {
|
|
|
|
// amd64:"NEGL","SBBQ",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG"
|
|
|
|
// ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
r, _ := bits.Sub(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
// SubM subtracts the three-element array q from p element by element into r,
// threading the borrow c through three chained bits.Sub calls (c starts at its
// zero value). The asmcheck directives sit before the second call, whose
// borrow-in is the borrow-out of the first.
func SubM(p, q, r *[3]uint) {
|
|
|
|
var c uint
|
|
|
|
r[0], c = bits.Sub(p[0], q[0], c)
|
|
|
|
// amd64:"SBBQ",-"NEGL",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:-"SUBC", "SUBE", -"SUBZE", -"NEG"
|
|
|
|
// ppc64le:-"SUBC", "SUBE", -"SUBZE", -"NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
r[1], c = bits.Sub(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Sub(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64 forwards to bits.Sub64 with a variable borrow-in ci and returns both
// the difference r and the borrow-out co. The arch-prefixed comments are
// asmcheck directives for the return statement; patterns with a leading '-'
// must not appear in the generated code.
func Sub64(x, y, ci uint64) (r, co uint64) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
|
|
|
|
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub64(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64C calls bits.Sub64 with the constant 7 as the subtrahend and a variable
// borrow-in ci, returning the difference and the borrow-out. The asmcheck
// directives list the same opcode patterns as for the all-variable Sub64 case.
func Sub64C(x, ci uint64) (r, co uint64) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
|
|
|
|
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub64(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64Z calls bits.Sub64 with a constant zero borrow-in and returns the
// difference and the borrow-out. The asmcheck directives exclude the
// borrow-consuming forms listed for Sub64 (NEGL on amd64, SBCS on arm64, SUBE
// on ppc64/ppc64le).
func Sub64Z(x, y uint64) (r, co uint64) {
|
|
|
|
// amd64:"SUBQ","SBBQ","NEGQ",-"NEGL"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", -"SUBE", "SUBZE", "NEG"
|
|
|
|
// ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBC"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub64(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64R returns only the difference from bits.Sub64(x, y, ci); the borrow-out
// is discarded. The asmcheck directives apply to the bits.Sub64 line and
// exclude NEGQ on amd64, NGC and NEG on arm64, and SUBZE and NEG on
// ppc64/ppc64le, which the borrow-returning Sub64 case expects.
func Sub64R(x, y, ci uint64) uint64 {
|
|
|
|
// amd64:"NEGL","SBBQ",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP"
|
2021-06-08 12:16:01 -06:00
|
|
|
// ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG"
|
|
|
|
// ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
r, _ := bits.Sub64(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
// Sub64M is the asmcheck case for a three-limb subtraction r = p - q built
// from chained bits.Sub64 calls, with the borrow carried from limb 0 upward.
//
// The directive comments apply to the statement directly below them (the
// middle limb, whose borrow-in comes from the previous call) and list the
// per-architecture instruction patterns expected (or, with a leading '-',
// forbidden) in its generated code.
func Sub64M(p, q, r *[3]uint64) {
	var c uint64
	r[0], c = bits.Sub64(p[0], q[0], c)
	// amd64:"SBBQ",-"NEGL",-"NEGQ"
	// arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP"
	// s390x:"SUBE"
	r[1], c = bits.Sub64(p[1], q[1], c)
	r[2], c = bits.Sub64(p[2], q[2], c)
}
|
|
|
|
|
2021-06-08 12:16:01 -06:00
|
|
|
// Sub64MSaveC is the asmcheck case for a two-limb subtraction r = p - q in
// which each limb's borrow-out is stored to memory (c[0], c[1]) and the
// second limb reads its borrow-in back from c[0].
//
// Each group of directive comments applies to the statement directly below
// it and lists the instruction patterns expected in its generated code.
func Sub64MSaveC(p, q, r, c *[2]uint64) {
	// ppc64:"SUBC\tR\\d+, R\\d+,", "SUBZE", "NEG"
	// ppc64le:"SUBC\tR\\d+, R\\d+,", "SUBZE", "NEG"
	r[0], c[0] = bits.Sub64(p[0], q[0], 0)
	// ppc64:"SUBC\tR\\d+, [$]0,", "SUBE", "SUBZE", "NEG"
	// ppc64le:"SUBC\tR\\d+, [$]0,", "SUBE", "SUBZE", "NEG"
	r[1], c[1] = bits.Sub64(p[1], q[1], c[0])
}
|
|
|
|
|
2020-02-17 04:43:33 -07:00
|
|
|
// Sub64PanicOnOverflowEQ is the asmcheck case for branching on the borrow-out
// of bits.Sub64 with an "== 1" comparison. It returns a - b and panics with
// "overflow" when the subtraction borrows.
//
// The borrow-out is stored back into parameter b. The directive comment
// applies to the if statement directly below it.
func Sub64PanicOnOverflowEQ(a, b uint64) uint64 {
	r, b := bits.Sub64(a, b, 0)
	// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
	if b == 1 {
		panic("overflow")
	}
	return r
}
|
|
|
|
|
|
|
|
// Sub64PanicOnOverflowNE is the asmcheck case for branching on the borrow-out
// of bits.Sub64 with a "!= 0" comparison. It returns a - b and panics with
// "overflow" when the subtraction borrows.
//
// The borrow-out is stored back into parameter b. The directive comment
// applies to the if statement directly below it.
func Sub64PanicOnOverflowNE(a, b uint64) uint64 {
	r, b := bits.Sub64(a, b, 0)
	// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
	if b != 0 {
		panic("overflow")
	}
	return r
}
|
|
|
|
|
|
|
|
// Sub64PanicOnOverflowGT is the asmcheck case for branching on the borrow-out
// of bits.Sub64 with a "> 0" comparison. It returns a - b and panics with
// "overflow" when the subtraction borrows.
//
// The borrow-out is stored back into parameter b. The directive comment
// applies to the if statement directly below it.
func Sub64PanicOnOverflowGT(a, b uint64) uint64 {
	r, b := bits.Sub64(a, b, 0)
	// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
	if b > 0 {
		panic("overflow")
	}
	return r
}
|
|
|
|
|
|
|
|
// Sub64MPanicOnOverflowEQ is the asmcheck case for a two-limb subtraction
// a - b via chained bits.Sub64 calls, followed by a branch on the final
// borrow-out using an "== 1" comparison. It returns the two-limb difference
// and panics with "overflow" when the high limb borrows.
//
// The directive comment applies to the if statement directly below it.
func Sub64MPanicOnOverflowEQ(a, b [2]uint64) [2]uint64 {
	var r [2]uint64
	var c uint64
	r[0], c = bits.Sub64(a[0], b[0], c)
	r[1], c = bits.Sub64(a[1], b[1], c)
	// s390x:"BRC\t[$]12,"
	if c == 1 {
		panic("overflow")
	}
	return r
}
|
|
|
|
|
|
|
|
// Sub64MPanicOnOverflowNE is the asmcheck case for a two-limb subtraction
// a - b via chained bits.Sub64 calls, followed by a branch on the final
// borrow-out using a "!= 0" comparison. It returns the two-limb difference
// and panics with "overflow" when the high limb borrows.
//
// The directive comment applies to the if statement directly below it.
func Sub64MPanicOnOverflowNE(a, b [2]uint64) [2]uint64 {
	var r [2]uint64
	var c uint64
	r[0], c = bits.Sub64(a[0], b[0], c)
	r[1], c = bits.Sub64(a[1], b[1], c)
	// s390x:"BRC\t[$]12,"
	if c != 0 {
		panic("overflow")
	}
	return r
}
|
|
|
|
|
|
|
|
// Sub64MPanicOnOverflowGT is the asmcheck case for a two-limb subtraction
// a - b via chained bits.Sub64 calls, followed by a branch on the final
// borrow-out using a "> 0" comparison. It returns the two-limb difference
// and panics with "overflow" when the high limb borrows.
//
// The directive comment applies to the if statement directly below it.
func Sub64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 {
	var r [2]uint64
	var c uint64
	r[0], c = bits.Sub64(a[0], b[0], c)
	r[1], c = bits.Sub64(a[1], b[1], c)
	// s390x:"BRC\t[$]12,"
	if c > 0 {
		panic("overflow")
	}
	return r
}
|
|
|
|
|
2018-08-14 16:41:22 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Mul* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Mul is the asmcheck case for bits.Mul on the platform-sized uint; it
// returns the high and low words of the full-width product x * y.
//
// The directive comments apply to the statement directly below them and
// list the per-architecture instruction patterns expected in its generated
// code.
func Mul(x, y uint) (hi, lo uint) {
	// amd64:"MULQ"
	// arm64:"UMULH","MUL"
	// ppc64:"MULHDU","MULLD"
	// ppc64le:"MULHDU","MULLD"
	// s390x:"MLGR"
	// mips64: "MULVU"
	return bits.Mul(x, y)
}
|
|
|
|
|
|
|
|
// Mul64 is the asmcheck case for bits.Mul64; it returns the high and low
// 64-bit halves of the 128-bit product x * y.
//
// The directive comments apply to the statement directly below them and
// list the per-architecture instruction patterns expected in its generated
// code.
func Mul64(x, y uint64) (hi, lo uint64) {
	// amd64:"MULQ"
	// arm64:"UMULH","MUL"
	// ppc64:"MULHDU","MULLD"
	// ppc64le:"MULHDU","MULLD"
	// s390x:"MLGR"
	// mips64: "MULVU"
	// riscv64:"MULHU","MUL"
	return bits.Mul64(x, y)
}
|
2018-10-23 20:54:56 -06:00
|
|
|
|
|
|
|
// --------------- //
|
|
|
|
// bits.Div* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Div is the asmcheck case for bits.Div on the platform-sized uint; it
// returns the quotient and remainder of the two-word value (hi, lo) divided
// by x.
//
// The directive comment applies to the statement directly below it and
// names the instruction pattern expected in its generated code.
func Div(hi, lo, x uint) (q, r uint) {
	// amd64:"DIVQ"
	return bits.Div(hi, lo, x)
}
|
|
|
|
|
cmd/compile: optimize math/bits.Div32 for arm64
Benchmark:
name old time/op new time/op delta
Div-8 22.0ns ± 0% 22.0ns ± 0% ~ (all equal)
Div32-8 6.51ns ± 0% 3.00ns ± 0% -53.90% (p=0.000 n=10+8)
Div64-8 22.5ns ± 0% 22.5ns ± 0% ~ (all equal)
Code:
func div32(hi, lo, y uint32) (q, r uint32) {return bits.Div32(hi, lo, y)}
Before:
0x0020 00032 (test.go:24) MOVWU "".y+8(FP), R0
0x0024 00036 ($GOROOT/src/math/bits/bits.go:472) CBZW R0, 132
0x0028 00040 ($GOROOT/src/math/bits/bits.go:472) MOVWU "".hi(FP), R1
0x002c 00044 ($GOROOT/src/math/bits/bits.go:472) CMPW R1, R0
0x0030 00048 ($GOROOT/src/math/bits/bits.go:472) BLS 96
0x0034 00052 ($GOROOT/src/math/bits/bits.go:475) MOVWU "".lo+4(FP), R2
0x0038 00056 ($GOROOT/src/math/bits/bits.go:475) ORR R1<<32, R2, R1
0x003c 00060 ($GOROOT/src/math/bits/bits.go:476) CBZ R0, 140
0x0040 00064 ($GOROOT/src/math/bits/bits.go:476) UDIV R0, R1, R2
0x0044 00068 (test.go:24) MOVW R2, "".q+16(FP)
0x0048 00072 ($GOROOT/src/math/bits/bits.go:476) UREM R0, R1, R0
0x0050 00080 (test.go:24) MOVW R0, "".r+20(FP)
0x0054 00084 (test.go:24) MOVD -8(RSP), R29
0x0058 00088 (test.go:24) MOVD.P 32(RSP), R30
0x005c 00092 (test.go:24) RET (R30)
After:
0x001c 00028 (test.go:24) MOVWU "".y+8(FP), R0
0x0020 00032 (test.go:24) CBZW R0, 92
0x0024 00036 (test.go:24) MOVWU "".hi(FP), R1
0x0028 00040 (test.go:24) CMPW R0, R1
0x002c 00044 (test.go:24) BHS 84
0x0030 00048 (test.go:24) MOVWU "".lo+4(FP), R2
0x0034 00052 (test.go:24) ORR R1<<32, R2, R4
0x0038 00056 (test.go:24) UDIV R0, R4, R3
0x003c 00060 (test.go:24) MSUB R3, R4, R0, R4
0x0040 00064 (test.go:24) MOVW R3, "".q+16(FP)
0x0044 00068 (test.go:24) MOVW R4, "".r+20(FP)
0x0048 00072 (test.go:24) MOVD -8(RSP), R29
0x004c 00076 (test.go:24) MOVD.P 16(RSP), R30
0x0050 00080 (test.go:24) RET (R30)
UREM instruction in the previous assembly code will be converted to UDIV and MSUB instructions
on arm64. However the UDIV instruction in UREM is unnecessary, because it's a duplicate of the
previous UDIV. This CL adds a rule to have this extra UDIV instruction removed by CSE.
Change-Id: Ie2508784320020b2de022806d09f75a7871bb3d7
Reviewed-on: https://go-review.googlesource.com/c/159577
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Bryan C. Mills <bcmills@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-22 02:10:59 -07:00
|
|
|
// Div32 is the asmcheck case for bits.Div32; it returns the quotient and
// remainder of the 64-bit value (hi, lo) divided by x.
//
// The directive comment applies to the statement directly below it and
// lists the instruction patterns expected (or, with a leading '-',
// forbidden) in its generated code.
func Div32(hi, lo, x uint32) (q, r uint32) {
	// arm64:"ORR","UDIV","MSUB",-"UREM"
	return bits.Div32(hi, lo, x)
}
|
|
|
|
|
2018-10-23 20:54:56 -06:00
|
|
|
// Div64 is the asmcheck case for bits.Div64; it returns the quotient and
// remainder of the 128-bit value (hi, lo) divided by x.
//
// The directive comment applies to the statement directly below it and
// names the instruction pattern expected in its generated code.
func Div64(hi, lo, x uint64) (q, r uint64) {
	// amd64:"DIVQ"
	return bits.Div64(hi, lo, x)
}
|
2019-04-20 12:09:34 -06:00
|
|
|
|
|
|
|
// Div64degenerate is the asmcheck case for bits.Div64 called with a constant
// zero high word and the constant divisor 5, so it returns x / 5 and x % 5.
//
// The directive comment applies to the statement directly below it; the
// leading '-' means the named instruction must not appear in its generated
// code.
func Div64degenerate(x uint64) (q, r uint64) {
	// amd64:-"DIVQ"
	return bits.Div64(0, x, 5)
}
|