2018-03-02 07:16:27 -07:00
|
|
|
// asmcheck
|
|
|
|
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package codegen
|
|
|
|
|
|
|
|
import "math/bits"
|
|
|
|
|
2018-03-05 11:46:18 -07:00
|
|
|
// ----------------------- //
|
|
|
|
// bits.LeadingZeros //
|
|
|
|
// ----------------------- //
|
|
|
|
|
|
|
|
// LeadingZeros returns bits.LeadingZeros(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func LeadingZeros(n uint) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRQ"
|
|
|
|
// amd64/v3:"LZCNTQ", -"BSRQ"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV",-"SUB"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CNTLZD"
|
2018-03-05 11:46:18 -07:00
|
|
|
return bits.LeadingZeros(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// LeadingZeros64 returns bits.LeadingZeros64(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func LeadingZeros64(n uint64) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRQ"
|
|
|
|
// amd64/v3:"LZCNTQ", -"BSRQ"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV",-"SUB"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CNTLZD"
|
2018-03-05 11:46:18 -07:00
|
|
|
return bits.LeadingZeros64(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// LeadingZeros32 returns bits.LeadingZeros32(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck). The
// arm64 expectation names the 32-bit count-leading-zeros opcode rather than
// the 64-bit one used by the wider variants.
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func LeadingZeros32(n uint32) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
|
|
|
|
// amd64/v3: "LZCNTL",- "BSRL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
cmd/compile: optimize math/bits Len32 intrinsic on arm64
Arm64 has a 32-bit CLZ instruction CLZW, which can be used for intrinsic Len32.
Function LeadingZeros32 calls Len32, with this change, the assembly code of
LeadingZeros32 becomes more concise.
Go code:
func f32(x uint32) { z = bits.LeadingZeros32(x) }
Before:
"".f32 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0004 00004 (test.go:7) MOVWU "".x(FP), R0
0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZ R0, R0
0x000c 00012 ($GOROOT/src/math/bits/bits.go:30) SUB $32, R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
After:
"".f32 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0004 00004 (test.go:7) MOVWU "".x(FP), R0
0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZW R0, R0
0x000c 00012 (test.go:7) MOVD R0, "".z(SB)
0x0018 00024 (test.go:7) RET (R30)
Benchmarks:
name old time/op new time/op delta
LeadingZeros-8 2.53ns ± 0% 2.55ns ± 0% +0.67% (p=0.000 n=10+10)
LeadingZeros8-8 3.56ns ± 0% 3.56ns ± 0% ~ (all equal)
LeadingZeros16-8 3.55ns ± 0% 3.56ns ± 0% ~ (p=0.465 n=10+10)
LeadingZeros32-8 3.55ns ± 0% 2.96ns ± 0% -16.71% (p=0.000 n=10+7)
LeadingZeros64-8 2.53ns ± 0% 2.54ns ± 0% ~ (p=0.059 n=8+10)
Change-Id: Ie5666bb82909e341060e02ffd4e86c0e5d67e90a
Reviewed-on: https://go-review.googlesource.com/c/157000
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-01-02 02:14:26 -07:00
|
|
|
// arm:"CLZ" arm64:"CLZW"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZW",-"SUB"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CNTLZW"
|
2018-03-05 11:46:18 -07:00
|
|
|
return bits.LeadingZeros32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// LeadingZeros16 returns bits.LeadingZeros16(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func LeadingZeros16(n uint16) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
|
|
|
|
// amd64/v3: "LZCNTL",- "BSRL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CNTLZD"
|
2018-03-05 11:46:18 -07:00
|
|
|
return bits.LeadingZeros16(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// LeadingZeros8 returns bits.LeadingZeros8(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func LeadingZeros8(n uint8) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
|
|
|
|
// amd64/v3: "LZCNTL",- "BSRL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CNTLZD"
|
2018-03-05 11:46:18 -07:00
|
|
|
return bits.LeadingZeros8(n)
|
|
|
|
}
|
|
|
|
|
2018-03-02 07:16:27 -07:00
|
|
|
// --------------- //
|
|
|
|
// bits.Len* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Len returns bits.Len(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func Len(n uint) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRQ"
|
|
|
|
// amd64/v3: "LZCNTQ"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC","CNTLZD"
|
2018-03-02 07:16:27 -07:00
|
|
|
return bits.Len(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Len64 returns bits.Len64(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func Len64(n uint64) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRQ"
|
|
|
|
// amd64/v3: "LZCNTQ"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC","CNTLZD"
|
2018-03-02 07:16:27 -07:00
|
|
|
return bits.Len64(n)
|
|
|
|
}
|
|
|
|
|
2020-08-20 14:06:23 -06:00
|
|
|
// SubFromLen64 returns 64 - bits.Len64(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): the minus-prefixed entries presumably assert that the
// subtraction folds into the count-leading-zeros opcode with no separate
// arithmetic instruction; confirm against the codegen test harness.
func SubFromLen64(n uint64) int {
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV",-"ADD"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CNTLZD",-"SUBC"
|
2020-08-20 14:06:23 -06:00
|
|
|
return 64 - bits.Len64(n)
|
|
|
|
}
|
|
|
|
|
2024-11-01 20:59:20 -06:00
|
|
|
// CompareWithLen64 reports whether bits.Len64(n) is less than 9.
//
// The loong64-tagged comment in the body appears to be an asmcheck
// expectation for the return statement (the file is marked asmcheck).
// NOTE(review): the minus-prefixed entries presumably forbid a separate add
// and the literal constants 64 and 9 in the output; confirm against the
// codegen test harness.
func CompareWithLen64(n uint64) bool {
|
|
|
|
// loong64:"CLZV",-"ADD",-"[$]64",-"[$]9"
|
|
|
|
return bits.Len64(n) < 9
|
|
|
|
}
|
|
|
|
|
2018-03-02 07:16:27 -07:00
|
|
|
// Len32 returns bits.Len32(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func Len32(n uint32) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
|
|
|
|
// amd64/v3: "LZCNTL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZW"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "CNTLZW"
|
2018-03-02 07:16:27 -07:00
|
|
|
return bits.Len32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Len16 returns bits.Len16(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func Len16(n uint16) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
|
|
|
|
// amd64/v3: "LZCNTL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC","CNTLZD"
|
2018-03-02 07:16:27 -07:00
|
|
|
return bits.Len16(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Len8 returns bits.Len8(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func Len8(n uint8) int {
|
2022-03-30 07:44:44 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
|
|
|
|
// amd64/v3: "LZCNTL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
|
|
|
// arm:"CLZ" arm64:"CLZ"
|
2024-11-01 20:59:20 -06:00
|
|
|
// loong64:"CLZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// mips:"CLZ"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Clz"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC","CNTLZD"
|
2018-03-02 07:16:27 -07:00
|
|
|
return bits.Len8(n)
|
|
|
|
}
|
2018-03-06 01:39:14 -07:00
|
|
|
|
2018-03-06 04:55:41 -07:00
|
|
|
// -------------------- //
|
|
|
|
// bits.OnesCount //
|
|
|
|
// -------------------- //
|
|
|
|
|
2021-09-15 01:31:05 -06:00
|
|
|
// TODO(register args) Restore a m d 6 4 / v 1 :.*x86HasPOPCNT when only one ABI is tested.
|
2018-03-06 04:55:41 -07:00
|
|
|
// OnesCount returns bits.OnesCount(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): the minus-prefixed x86HasPOPCNT entries presumably assert
// that the v2 and v3 amd64 levels emit no runtime feature check; confirm
// against the codegen test harness.
func OnesCount(n uint) int {
|
2021-09-15 01:31:05 -06:00
|
|
|
// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
|
2019-12-19 11:58:28 -07:00
|
|
|
// amd64:"POPCNTQ"
|
2018-03-08 09:43:55 -07:00
|
|
|
// arm64:"VCNT","VUADDLV"
|
2024-10-18 02:31:29 -06:00
|
|
|
// loong64:"VPCNTV"
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
// s390x:"POPCNT"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"POPCNTD"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Popcnt"
|
2018-03-06 04:55:41 -07:00
|
|
|
return bits.OnesCount(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// OnesCount64 returns bits.OnesCount64(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): the minus-prefixed x86HasPOPCNT entries presumably assert
// that the v2 and v3 amd64 levels emit no runtime feature check; confirm
// against the codegen test harness.
func OnesCount64(n uint64) int {
|
2021-09-15 01:31:05 -06:00
|
|
|
// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
|
2019-12-19 11:58:28 -07:00
|
|
|
// amd64:"POPCNTQ"
|
2018-03-08 09:43:55 -07:00
|
|
|
// arm64:"VCNT","VUADDLV"
|
2024-10-18 02:31:29 -06:00
|
|
|
// loong64:"VPCNTV"
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
// s390x:"POPCNT"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"POPCNTD"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Popcnt"
|
2018-03-06 04:55:41 -07:00
|
|
|
return bits.OnesCount64(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// OnesCount32 returns bits.OnesCount32(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): the minus-prefixed x86HasPOPCNT entries presumably assert
// that the v2 and v3 amd64 levels emit no runtime feature check; confirm
// against the codegen test harness.
func OnesCount32(n uint32) int {
|
2021-09-15 01:31:05 -06:00
|
|
|
// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
|
2019-12-19 11:58:28 -07:00
|
|
|
// amd64:"POPCNTL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// arm64:"VCNT","VUADDLV"
|
2024-10-18 02:31:29 -06:00
|
|
|
// loong64:"VPCNTW"
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
// s390x:"POPCNT"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"POPCNTW"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Popcnt"
|
2018-03-06 04:55:41 -07:00
|
|
|
return bits.OnesCount32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// OnesCount16 returns bits.OnesCount16(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): the minus-prefixed x86HasPOPCNT entries presumably assert
// that the v2 and v3 amd64 levels emit no runtime feature check; confirm
// against the codegen test harness.
func OnesCount16(n uint16) int {
|
2021-09-15 01:31:05 -06:00
|
|
|
// amd64/v2:-".*x86HasPOPCNT" amd64/v3:-".*x86HasPOPCNT"
|
2019-12-19 11:58:28 -07:00
|
|
|
// amd64:"POPCNTL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// arm64:"VCNT","VUADDLV"
|
2024-10-18 02:31:29 -06:00
|
|
|
// loong64:"VPCNTH"
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
// s390x:"POPCNT"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"POPCNTW"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Popcnt"
|
2018-03-06 04:55:41 -07:00
|
|
|
return bits.OnesCount16(n)
|
|
|
|
}
|
|
|
|
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
// OnesCount8 returns bits.OnesCount8(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// Unlike the wider OnesCount wrappers in this file, no amd64, arm64 or
// loong64 expectation is listed here.
func OnesCount8(n uint8) int {
|
|
|
|
// s390x:"POPCNT"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"POPCNTB"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Popcnt"
|
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-05-25 10:54:58 -06:00
|
|
|
return bits.OnesCount8(n)
|
|
|
|
}
|
|
|
|
|
2024-11-02 01:40:13 -06:00
|
|
|
// ------------------ //
|
|
|
|
// bits.Reverse //
|
|
|
|
// ------------------ //
|
|
|
|
|
|
|
|
// Reverse returns bits.Reverse(n).
//
// The loong64-tagged comment in the body appears to be an asmcheck
// expectation for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func Reverse(n uint) uint {
|
|
|
|
// loong64:"BITREVV"
|
|
|
|
return bits.Reverse(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reverse64 returns bits.Reverse64(n).
//
// The loong64-tagged comment in the body appears to be an asmcheck
// expectation for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func Reverse64(n uint64) uint64 {
|
|
|
|
// loong64:"BITREVV"
|
|
|
|
return bits.Reverse64(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reverse32 returns bits.Reverse32(n).
//
// The loong64-tagged comment in the body appears to be an asmcheck
// expectation for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func Reverse32(n uint32) uint32 {
|
|
|
|
// loong64:"BITREVW"
|
|
|
|
return bits.Reverse32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reverse16 returns bits.Reverse16(n).
//
// The loong64-tagged comment in the body appears to be an asmcheck
// expectation for the return statement (the file is marked asmcheck); it
// lists two opcodes, both presumably required. Confirm against the codegen
// test harness.
func Reverse16(n uint16) uint16 {
|
|
|
|
// loong64:"BITREV4B","REVB2H"
|
|
|
|
return bits.Reverse16(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reverse8 returns bits.Reverse8(n).
//
// The loong64-tagged comment in the body appears to be an asmcheck
// expectation for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func Reverse8(n uint8) uint8 {
|
|
|
|
// loong64:"BITREV4B"
|
|
|
|
return bits.Reverse8(n)
|
|
|
|
}
|
|
|
|
|
2018-03-06 12:10:35 -07:00
|
|
|
// ----------------------- //
|
|
|
|
// bits.ReverseBytes //
|
|
|
|
// ----------------------- //
|
|
|
|
|
|
|
|
// ReverseBytes returns bits.ReverseBytes(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func ReverseBytes(n uint) uint {
|
2018-03-08 09:43:55 -07:00
|
|
|
// amd64:"BSWAPQ"
|
2023-02-06 06:21:52 -07:00
|
|
|
// 386:"BSWAPL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"MOVDBR"
|
|
|
|
// arm64:"REV"
|
2024-11-02 00:30:31 -06:00
|
|
|
// loong64:"REVBV"
|
2018-03-06 12:10:35 -07:00
|
|
|
return bits.ReverseBytes(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// ReverseBytes64 returns bits.ReverseBytes64(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func ReverseBytes64(n uint64) uint64 {
|
2018-03-08 09:43:55 -07:00
|
|
|
// amd64:"BSWAPQ"
|
2023-02-06 06:21:52 -07:00
|
|
|
// 386:"BSWAPL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"MOVDBR"
|
|
|
|
// arm64:"REV"
|
2022-10-31 10:47:17 -06:00
|
|
|
// ppc64x/power10: "BRD"
|
2024-11-02 00:30:31 -06:00
|
|
|
// loong64:"REVBV"
|
2018-03-06 12:10:35 -07:00
|
|
|
return bits.ReverseBytes64(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// ReverseBytes32 returns bits.ReverseBytes32(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func ReverseBytes32(n uint32) uint32 {
|
2018-03-08 09:43:55 -07:00
|
|
|
// amd64:"BSWAPL"
|
2023-02-06 06:21:52 -07:00
|
|
|
// 386:"BSWAPL"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"MOVWBR"
|
|
|
|
// arm64:"REVW"
|
2024-11-02 00:30:31 -06:00
|
|
|
// loong64:"REVB2W"
|
2022-10-31 10:47:17 -06:00
|
|
|
// ppc64x/power10: "BRW"
|
2018-03-06 12:10:35 -07:00
|
|
|
return bits.ReverseBytes32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// ReverseBytes16 returns bits.ReverseBytes16(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// The arm expectations are split by sub-architecture level: level 5 lists a
// shift/shift/or sequence, levels 6 and 7 a single byte-reverse opcode.
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func ReverseBytes16(n uint16) uint16 {
|
2018-03-08 09:43:55 -07:00
|
|
|
// amd64:"ROLW"
|
cmd/compile: add an optimaztion rule for math/bits.ReverseBytes16 on arm64
On amd64 ReverseBytes16 is lowered to a rotate instruction. However arm64 doesn't
have 16-bit rotate instruction, but has a REV16W instruction which can be used
for ReverseBytes16. This CL adds a rule to turn the patterns like (x<<8) | (x>>8)
(the type of x is uint16, and "|" can also be "^" or "+") to a REV16W instruction.
Code:
func reverseBytes16(i uint16) uint16 { return bits.ReverseBytes16(i) }
Before:
0x0004 00004 (test.go:6) MOVHU "".i(FP), R0
0x0008 00008 ($GOROOT/src/math/bits/bits.go:262) UBFX $8, R0, $8, R1
0x000c 00012 ($GOROOT/src/math/bits/bits.go:262) ORR R0<<8, R1, R0
0x0010 00016 (test.go:6) MOVH R0, "".~r1+8(FP)
0x0014 00020 (test.go:6) RET (R30)
After:
0x0000 00000 (test.go:6) MOVHU "".i(FP), R0
0x0004 00004 (test.go:6) REV16W R0, R0
0x0008 00008 (test.go:6) MOVH R0, "".~r1+8(FP)
0x000c 00012 (test.go:6) RET (R30)
Benchmarks:
name old time/op new time/op delta
ReverseBytes-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal)
ReverseBytes16-224 1.500000ns +- 0% 1.000000ns +- 0% -33.33% (p=0.000 n=9+10)
ReverseBytes32-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal)
ReverseBytes64-224 1.000000ns +- 0% 1.000000ns +- 0% ~ (all equal)
Change-Id: I87cd41b2d8e549bf39c601f185d5775bd42d739c
Reviewed-on: https://go-review.googlesource.com/c/157757
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-02-10 23:37:49 -07:00
|
|
|
// arm64:"REV16W",-"UBFX",-"ORR"
|
2019-02-11 02:40:02 -07:00
|
|
|
// arm/5:"SLL","SRL","ORR"
|
|
|
|
// arm/6:"REV16"
|
|
|
|
// arm/7:"REV16"
|
2024-11-02 00:30:31 -06:00
|
|
|
// loong64:"REVB2H"
|
2022-10-31 10:47:17 -06:00
|
|
|
// ppc64x/power10: "BRH"
|
2018-03-06 12:10:35 -07:00
|
|
|
return bits.ReverseBytes16(n)
|
|
|
|
}
|
|
|
|
|
2018-03-08 09:43:55 -07:00
|
|
|
// --------------------- //
|
|
|
|
// bits.RotateLeft //
|
|
|
|
// --------------------- //
|
|
|
|
|
|
|
|
// RotateLeft64 returns bits.RotateLeft64(n, 37), i.e. a rotate by the
// constant 37.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func RotateLeft64(n uint64) uint64 {
|
|
|
|
// amd64:"ROLQ"
|
|
|
|
// arm64:"ROR"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"ROTRV"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ROTL"
|
2024-05-29 03:47:00 -06:00
|
|
|
// riscv64:"RORI"
|
2020-05-11 10:44:48 -06:00
|
|
|
// s390x:"RISBGZ\t[$]0, [$]63, [$]37, "
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Rotl"
|
2018-03-08 09:43:55 -07:00
|
|
|
return bits.RotateLeft64(n, 37)
|
|
|
|
}
|
|
|
|
|
|
|
|
// RotateLeft32 returns bits.RotateLeft32(n, 9), i.e. a rotate by the
// constant 9.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func RotateLeft32(n uint32) uint32 {
|
|
|
|
// amd64:"ROLL" 386:"ROLL"
|
2019-08-28 12:32:10 -06:00
|
|
|
// arm:`MOVW\tR[0-9]+@>23`
|
2018-03-08 09:43:55 -07:00
|
|
|
// arm64:"RORW"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"ROTR\t"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ROTLW"
|
2024-05-29 03:47:00 -06:00
|
|
|
// riscv64:"RORIW"
|
2018-09-03 08:47:58 -06:00
|
|
|
// s390x:"RLL"
|
2019-05-17 15:16:38 -06:00
|
|
|
// wasm:"I32Rotl"
|
2018-03-08 09:43:55 -07:00
|
|
|
return bits.RotateLeft32(n, 9)
|
|
|
|
}
|
|
|
|
|
2022-07-13 03:00:57 -06:00
|
|
|
// RotateLeft16 returns bits.RotateLeft16(n, s) for a variable rotate count s.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func RotateLeft16(n uint16, s int) uint16 {
|
2018-03-08 09:43:55 -07:00
|
|
|
// amd64:"ROLW" 386:"ROLW"
|
2022-07-13 03:00:57 -06:00
|
|
|
// arm64:"RORW",-"CSEL"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"ROTR\t","SLLV"
|
2022-07-13 03:00:57 -06:00
|
|
|
return bits.RotateLeft16(n, s)
|
2018-03-08 09:43:55 -07:00
|
|
|
}
|
|
|
|
|
2022-07-13 03:00:57 -06:00
|
|
|
// RotateLeft8 returns bits.RotateLeft8(n, s) for a variable rotate count s.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// NOTE(review): a leading minus presumably marks an opcode that must not be
// emitted; confirm against the codegen test harness.
func RotateLeft8(n uint8, s int) uint8 {
|
2018-03-08 09:43:55 -07:00
|
|
|
// amd64:"ROLB" 386:"ROLB"
|
2022-07-13 03:00:57 -06:00
|
|
|
// arm64:"LSL","LSR",-"CSEL"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"OR","SLLV","SRLV"
|
2022-07-13 03:00:57 -06:00
|
|
|
return bits.RotateLeft8(n, s)
|
2018-03-08 09:43:55 -07:00
|
|
|
}
|
|
|
|
|
2018-09-03 08:47:58 -06:00
|
|
|
// RotateLeftVariable returns bits.RotateLeft(n, m) for a variable rotate
// count m.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func RotateLeftVariable(n uint, m int) uint {
|
|
|
|
// amd64:"ROLQ"
|
2018-06-30 00:48:51 -06:00
|
|
|
// arm64:"ROR"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"ROTRV"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ROTL"
|
2024-05-29 03:47:00 -06:00
|
|
|
// riscv64:"ROL"
|
2018-09-03 08:47:58 -06:00
|
|
|
// s390x:"RLLG"
|
2019-05-17 15:16:38 -06:00
|
|
|
// wasm:"I64Rotl"
|
2018-09-03 08:47:58 -06:00
|
|
|
return bits.RotateLeft(n, m)
|
|
|
|
}
|
|
|
|
|
|
|
|
// RotateLeftVariable64 returns bits.RotateLeft64(n, m) for a variable rotate
// count m.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func RotateLeftVariable64(n uint64, m int) uint64 {
|
|
|
|
// amd64:"ROLQ"
|
2018-06-30 00:48:51 -06:00
|
|
|
// arm64:"ROR"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"ROTRV"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ROTL"
|
2024-05-29 03:47:00 -06:00
|
|
|
// riscv64:"ROL"
|
2018-09-03 08:47:58 -06:00
|
|
|
// s390x:"RLLG"
|
2019-05-17 15:16:38 -06:00
|
|
|
// wasm:"I64Rotl"
|
2018-09-03 08:47:58 -06:00
|
|
|
return bits.RotateLeft64(n, m)
|
|
|
|
}
|
|
|
|
|
|
|
|
// RotateLeftVariable32 returns bits.RotateLeft32(n, m) for a variable rotate
// count m.
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck);
// confirm the exact matching rules against the codegen test harness.
func RotateLeftVariable32(n uint32, m int) uint32 {
|
2019-08-01 20:20:38 -06:00
|
|
|
// arm:`MOVW\tR[0-9]+@>R[0-9]+`
|
2018-09-03 08:47:58 -06:00
|
|
|
// amd64:"ROLL"
|
2018-06-30 00:48:51 -06:00
|
|
|
// arm64:"RORW"
|
2024-08-12 03:20:44 -06:00
|
|
|
// loong64:"ROTR\t"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ROTLW"
|
2024-05-29 03:47:00 -06:00
|
|
|
// riscv64:"ROLW"
|
2018-09-03 08:47:58 -06:00
|
|
|
// s390x:"RLL"
|
2019-05-17 15:16:38 -06:00
|
|
|
// wasm:"I32Rotl"
|
2018-09-03 08:47:58 -06:00
|
|
|
return bits.RotateLeft32(n, m)
|
|
|
|
}
|
|
|
|
|
2018-03-06 01:39:14 -07:00
|
|
|
// ------------------------ //
|
|
|
|
// bits.TrailingZeros //
|
|
|
|
// ------------------------ //
|
|
|
|
|
|
|
|
// TrailingZeros returns bits.TrailingZeros(n).
//
// The architecture-tagged comments in the body appear to be asmcheck
// expectations for the return statement (the file is marked asmcheck).
// Several architectures carry per-level expectations (amd64 v1/v2 versus v3,
// ppc64x power8 versus power9). Confirm the exact matching rules against the
// codegen test harness.
func TrailingZeros(n uint) int {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
|
|
|
|
// amd64/v3:"TZCNTQ"
|
2023-03-12 00:34:20 -07:00
|
|
|
// 386:"BSFL"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"CLZ"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBIT","CLZ"
|
2024-11-01 02:09:32 -06:00
|
|
|
// loong64:"CTZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x/power8:"ANDN","POPCNTD"
|
|
|
|
// ppc64x/power9: "CNTTZD"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TrailingZeros64(n uint64) int {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
|
|
|
|
// amd64/v3:"TZCNTQ"
|
2023-03-12 00:34:20 -07:00
|
|
|
// 386:"BSFL"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBIT","CLZ"
|
2024-11-01 02:09:32 -06:00
|
|
|
// loong64:"CTZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x/power8:"ANDN","POPCNTD"
|
|
|
|
// ppc64x/power9: "CNTTZD"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros64(n)
|
|
|
|
}
|
|
|
|
|
2020-08-20 14:06:23 -06:00
|
|
|
// TrailingZeros64Subtract counts the trailing zero bits of 1 - n, so the
// operand of the intrinsic is itself the result of subtracting n from a constant.
func TrailingZeros64Subtract(n uint64) int {
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x/power8:"NEG","SUBC","ANDN","POPCNTD"
|
|
|
|
// ppc64x/power9:"SUBC","CNTTZD"
|
2020-08-20 14:06:23 -06:00
|
|
|
return bits.TrailingZeros64(1 - n)
|
|
|
|
}
|
|
|
|
|
2018-03-06 01:39:14 -07:00
|
|
|
func TrailingZeros32(n uint32) int {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BTSQ\\t\\$32","BSFQ"
|
|
|
|
// amd64/v3:"TZCNTL"
|
2023-03-12 00:34:20 -07:00
|
|
|
// 386:"BSFL"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"CLZ"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBITW","CLZW"
|
2024-11-01 02:09:32 -06:00
|
|
|
// loong64:"CTZW"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR","MOVWZ"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x/power8:"ANDN","POPCNTW"
|
|
|
|
// ppc64x/power9: "CNTTZW"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros32(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TrailingZeros16(n uint16) int {
|
2023-08-01 15:32:56 -06:00
|
|
|
// amd64:"BSFL","ORL\\t\\$65536"
|
2019-08-30 00:24:58 -06:00
|
|
|
// 386:"BSFL\t"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t"
|
2024-11-01 02:09:32 -06:00
|
|
|
// loong64:"CTZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR","OR\t\\$65536"
|
cmd/internal/obj/ppc64: remove C_UCON optab matching class
This optab matching rule was used to match signed 16 bit values shifted
left by 16 bits. Unsigned 16 bit values greater than 0x7FFF<<16 were
classified as C_U32CON which led to larger than necessary codegen.
Instead, rewrite logical/arithmetic operations in the preprocessor pass
to use the 16 bit shifted immediate operation (e.g. ADDIS vs ADD). This
simplifies the optab matching rules, while also minimizing codegen size
for large unsigned values.
Note, ADDIS sign-extends the constant argument, all others do not.
For matching opcodes, this means:
MOVD $is<<16,Rx becomes ADDIS $is,Rx or ORIS $is,Rx
MOVW $is<<16,Rx becomes ADDIS $is,Rx
ADD $is<<16,[Rx,]Ry becomes ADDIS $is,[Rx,]Ry
OR $is<<16,[Rx,]Ry becomes ORIS $is,[Rx,]Ry
XOR $is<<16,[Rx,]Ry becomes XORIS $is,[Rx,]Ry
Change-Id: I1a988d9f52517a04bb8dc2e41d7caf3d5fff867c
Reviewed-on: https://go-review.googlesource.com/c/go/+/536735
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
2023-10-12 08:25:30 -06:00
|
|
|
// ppc64x/power8:"POPCNTD","ORIS\\t\\$1"
|
|
|
|
// ppc64x/power9:"CNTTZD","ORIS\\t\\$1"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros16(n)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TrailingZeros8(n uint8) int {
|
2023-08-01 15:32:56 -06:00
|
|
|
// amd64:"BSFL","ORL\\t\\$256"
|
2023-03-12 00:34:20 -07:00
|
|
|
// 386:"BSFL"
|
2019-03-15 01:49:38 -06:00
|
|
|
// arm:"ORR\t\\$256","CLZ",-"MOVBU\tR"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t"
|
2024-11-01 02:09:32 -06:00
|
|
|
// loong64:"CTZV"
|
2018-03-08 09:43:55 -07:00
|
|
|
// s390x:"FLOGR","OR\t\\$256"
|
2019-03-04 17:56:17 -07:00
|
|
|
// wasm:"I64Ctz"
|
2018-03-06 01:39:14 -07:00
|
|
|
return bits.TrailingZeros8(n)
|
|
|
|
}
|
2018-04-25 12:52:06 -06:00
|
|
|
|
|
|
|
// IterateBitsNN checks special handling of TrailingZerosNN when the input is known to be non-zero.
|
|
|
|
|
|
|
|
// IterateBits sums bits.TrailingZeros(n) over a loop that clears the lowest
// set bit of n each iteration; the loop condition guarantees n is non-zero
// at the point of the intrinsic call.
func IterateBits(n uint) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
|
|
|
|
// amd64/v3:"TZCNTQ"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
// IterateBits64 sums bits.TrailingZeros64(n) over a loop that clears the
// lowest set bit of n each iteration; the loop condition guarantees n is
// non-zero at the point of the intrinsic call.
func IterateBits64(n uint64) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
|
|
|
|
// amd64/v3:"TZCNTQ"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros64(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
// IterateBits32 sums bits.TrailingZeros32(n) over a loop that clears the
// lowest set bit of n each iteration; the loop condition guarantees n is
// non-zero at the point of the intrinsic call.
func IterateBits32(n uint32) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFL",-"BTSQ"
|
|
|
|
// amd64/v3:"TZCNTL"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros32(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
func IterateBits16(n uint16) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFL",-"BTSL"
|
|
|
|
// amd64/v3:"TZCNTL"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBITW","CLZW",-"ORR"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros16(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
|
|
|
|
|
|
|
func IterateBits8(n uint8) int {
|
|
|
|
i := 0
|
|
|
|
for n != 0 {
|
2021-09-29 19:57:04 -06:00
|
|
|
// amd64/v1,amd64/v2:"BSFL",-"BTSL"
|
|
|
|
// amd64/v3:"TZCNTL"
|
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name old time/op new time/op delta
TrailingZeros-8 2.75ns ± 0% 2.75ns ± 0% ~ (all equal)
TrailingZeros8-8 3.49ns ± 1% 2.93ns ± 0% -16.00% (p=0.000 n=10+10)
TrailingZeros16-8 3.49ns ± 1% 2.93ns ± 0% -16.05% (p=0.000 n=9+10)
TrailingZeros32-8 2.67ns ± 1% 2.68ns ± 1% ~ (p=0.468 n=10+10)
TrailingZeros64-8 2.67ns ± 1% 2.65ns ± 0% -0.62% (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) MOVHU R0, R0
0x0008 00008 (test.go:7) ORR $65536, R0, R0
0x000c 00012 (test.go:7) RBIT R0, R0
0x0010 00016 (test.go:7) CLZ R0, R0
0x0014 00020 (test.go:7) MOVD R0, "".z(SB)
0x0020 00032 (test.go:7) RET (R30)
This line of code is unnecessary:
0x0004 00004 (test.go:7) MOVHU R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
0x0000 00000 (test.go:7) TEXT "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
0x0000 00000 (test.go:7) FUNCDATA ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) FUNCDATA $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x0000 00000 (test.go:7) PCDATA $2, ZR
0x0000 00000 (test.go:7) PCDATA ZR, ZR
0x0000 00000 (test.go:7) MOVD "".x(FP), R0
0x0004 00004 (test.go:7) ORR $65536, R0, R0
0x0008 00008 (test.go:7) RBITW R0, R0
0x000c 00012 (test.go:7) CLZW R0, R0
0x0010 00016 (test.go:7) MOVD R0, "".z(SB)
0x001c 00028 (test.go:7) RET (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-03 02:25:06 -07:00
|
|
|
// arm64:"RBITW","CLZW",-"ORR"
|
2018-04-25 12:52:06 -06:00
|
|
|
i += bits.TrailingZeros8(n)
|
|
|
|
n &= n - 1
|
|
|
|
}
|
|
|
|
return i
|
|
|
|
}
|
2018-08-14 16:41:22 -06:00
|
|
|
|
2018-10-23 15:05:38 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Add* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Add returns the sum and carry-out of bits.Add(x, y, ci), with all three
// operands (including the carry-in) supplied by the caller and both results used.
func Add(x, y, ci uint) (r, co uint) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD","SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddC is the constant-operand variant of Add: it adds the literal 7 to x
// with carry-in ci and returns both the sum and the carry-out.
func AddC(x, ci uint) (r, co uint) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", "SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV","SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD","SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddZ is the zero-carry-in variant of Add: it calls bits.Add with the
// carry-in fixed to the constant 0 and returns both the sum and the carry-out.
func AddZ(x, y uint) (r, co uint) {
|
2019-03-20 21:24:47 -06:00
|
|
|
// arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", "SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", -"ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDC",-"ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV","SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD","SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddR is the result-only variant of Add: it calls bits.Add(x, y, ci) and
// returns just the sum, discarding the carry-out.
func AddR(x, y, ci uint) uint {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", -"SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", "ADDE", -"ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV",-"SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD",-"SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
r, _ := bits.Add(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
2019-04-30 10:46:23 -06:00
|
|
|
|
2018-10-23 15:05:38 -06:00
|
|
|
// AddM is the multi-word variant of Add: it adds the three-element arrays
// *p and *q element by element into *r, feeding each bits.Add carry-out into
// the next call. The first carry-in is the zero value of c. The directive
// comments sit directly above the second addition.
func AddM(p, q, r *[3]uint) {
|
|
|
|
var c uint
|
|
|
|
r[0], c = bits.Add(p[0], q[0], c)
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE",-"ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
r[1], c = bits.Add(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Add(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64 returns the sum and carry-out of bits.Add64(x, y, ci), with all three
// operands (including the carry-in) supplied by the caller and both results used.
func Add64(x, y, ci uint64) (r, co uint64) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", "SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV","SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD","SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add64(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64C is the constant-operand variant of Add64: it adds the literal 7 to x
// with carry-in ci and returns both the sum and the carry-out.
func Add64C(x, ci uint64) (r, co uint64) {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ","SBBQ","NEGQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", "SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", "ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV","SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD","SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add64(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64Z is the zero-carry-in variant of Add64: it calls bits.Add64 with the
// carry-in fixed to the constant 0 and returns both the sum and the carry-out.
func Add64Z(x, y uint64) (r, co uint64) {
|
2019-03-20 21:24:47 -06:00
|
|
|
// arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", "SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", -"ADDE", "ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDC",-"ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV","SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD","SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
return bits.Add64(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64R is the result-only variant of Add64: it calls bits.Add64(x, y, ci)
// and returns just the sum, discarding the carry-out.
func Add64R(x, y, ci uint64) uint64 {
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADDS","ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ"
|
2022-09-06 08:12:16 -06:00
|
|
|
// loong64: "ADDV", -"SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC", "ADDE", -"ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE","ADDC\t[$]-1,"
|
2023-05-25 23:26:51 -06:00
|
|
|
// mips64:"ADDV",-"SGTU"
|
2022-07-29 00:24:26 -06:00
|
|
|
// riscv64: "ADD",-"SLTU"
|
2018-10-23 15:05:38 -06:00
|
|
|
r, _ := bits.Add64(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
2024-03-07 14:37:14 -07:00
|
|
|
|
2018-10-23 15:05:38 -06:00
|
|
|
// Add64M is the multi-word variant of Add64: it adds the three-element arrays
// *p and *q element by element into *r, feeding each bits.Add64 carry-out into
// the next call. The first carry-in is the zero value of c. The directive
// comments sit directly above the second addition.
func Add64M(p, q, r *[3]uint64) {
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(p[0], q[0], c)
|
2019-01-14 02:36:18 -07:00
|
|
|
// arm64:"ADCS",-"ADD\t",-"CMP"
|
2018-10-23 15:05:38 -06:00
|
|
|
// amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: -"ADDC", "ADDE", -"ADDZE"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"ADDE",-"ADDC\t[$]-1,"
|
2018-10-23 15:05:38 -06:00
|
|
|
r[1], c = bits.Add64(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Add64(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
2024-03-07 14:37:14 -07:00
|
|
|
// Add64M0 is a three-step carry chain with special-case operands: the first
// step uses a constant 0 carry-in, the second adds the constant 0 plus the
// incoming carry to p[1], and the third adds p[2] to itself plus the carry.
// Note that q is only read in the first step.
func Add64M0(p, q, r *[3]uint64) {
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(p[0], q[0], 0)
|
|
|
|
// ppc64x: -"ADDC", -"ADDE", "ADDZE\tR[1-9]"
|
|
|
|
r[1], c = bits.Add64(p[1], 0, c)
|
|
|
|
// ppc64x: -"ADDC", "ADDE", -"ADDZE"
|
|
|
|
r[2], c = bits.Add64(p[2], p[2], c)
|
|
|
|
}
|
|
|
|
|
2021-06-08 12:16:01 -06:00
|
|
|
// Add64MSaveC is a two-step carry chain in which every carry-out is stored to
// memory (c[0], c[1]) and the second step reloads its carry-in from c[0]
// instead of keeping it in a local variable.
func Add64MSaveC(p, q, r, c *[2]uint64) {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC\tR", "ADDZE"
|
2021-06-08 12:16:01 -06:00
|
|
|
r[0], c[0] = bits.Add64(p[0], q[0], 0)
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDC\t[$]-1", "ADDE", "ADDZE"
|
2021-06-08 12:16:01 -06:00
|
|
|
r[1], c[1] = bits.Add64(p[1], q[1], c[0])
|
|
|
|
}
|
|
|
|
|
2020-02-17 04:43:33 -07:00
|
|
|
// Add64PanicOnOverflowEQ returns a + b and panics with "overflow" when the
// carry-out of bits.Add64 equals 1. The carry is tested with ==.
func Add64PanicOnOverflowEQ(a, b uint64) uint64 {
|
|
|
|
r, c := bits.Add64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]3,",-"ADDE"
|
|
|
|
if c == 1 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64PanicOnOverflowNE returns a + b and panics with "overflow" when the
// carry-out of bits.Add64 is not 0. The carry is tested with !=.
func Add64PanicOnOverflowNE(a, b uint64) uint64 {
|
|
|
|
r, c := bits.Add64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]3,",-"ADDE"
|
|
|
|
if c != 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64PanicOnOverflowGT returns a + b and panics with "overflow" when the
// carry-out of bits.Add64 is greater than 0. The carry is tested with >.
func Add64PanicOnOverflowGT(a, b uint64) uint64 {
|
|
|
|
r, c := bits.Add64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]3,",-"ADDE"
|
|
|
|
if c > 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64MPanicOnOverflowEQ adds the two-element arrays a and b as a carry
// chain and panics with "overflow" when the final carry-out equals 1.
// The carry is tested with ==.
func Add64MPanicOnOverflowEQ(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Add64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]3,"
|
|
|
|
if c == 1 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64MPanicOnOverflowNE adds the two-element arrays a and b as a carry
// chain and panics with "overflow" when the final carry-out is not 0.
// The carry is tested with !=.
func Add64MPanicOnOverflowNE(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Add64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]3,"
|
|
|
|
if c != 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add64MPanicOnOverflowGT adds the two-element arrays a and b as a carry
// chain and panics with "overflow" when the final carry-out is greater than 0.
// The carry is tested with >.
func Add64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Add64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Add64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]3,"
|
|
|
|
if c > 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2022-10-28 14:59:43 -06:00
|
|
|
// Verify independent carry chain operations are scheduled efficiently
|
|
|
|
// and do not cause unnecessary save/restore of the CA bit.
|
|
|
|
//
|
|
|
|
// This is an example of why CarryChainTail priority must be lower
|
|
|
|
// (earlier in the block) than Memory. f[0]=f1 could be scheduled
|
|
|
|
// after the first two lower 64 bit limb adds, but before either
|
|
|
|
// of the high 64 bit limbs is added.
|
|
|
|
//
|
|
|
|
// This is what happened on PPC64 when compiling
|
|
|
|
// crypto/internal/edwards25519/field.feMulGeneric.
|
|
|
|
// Add64MultipleChains runs two separate two-limb carry chains back to back:
// first a + b, then c added to that sum. Each chain starts with a constant 0
// carry-in and discards its final carry-out.
func Add64MultipleChains(a, b, c, d [2]uint64) {
|
|
|
|
var cx, d1, d2 uint64
|
|
|
|
a1, a2 := a[0], a[1]
|
|
|
|
b1, b2 := b[0], b[1]
|
|
|
|
c1, c2 := c[0], c[1]
|
|
|
|
|
2023-01-25 10:53:10 -07:00
|
|
|
// First chain, low then high limb of a + b.
// ppc64x: "ADDC\tR\\d+,", -"ADDE", -"MOVD\tXER"
|
2022-10-28 14:59:43 -06:00
|
|
|
d1, cx = bits.Add64(a1, b1, 0)
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDE", -"ADDC", -"MOVD\t.*, XER"
|
2022-10-28 14:59:43 -06:00
|
|
|
d2, _ = bits.Add64(a2, b2, cx)
|
|
|
|
|
|
2023-01-25 10:53:10 -07:00
|
|
|
// Second chain, adding c to the previous sum; cx is restarted from 0 here.
// ppc64x: "ADDC\tR\\d+,", -"ADDE", -"MOVD\tXER"
|
2022-10-28 14:59:43 -06:00
|
|
|
d1, cx = bits.Add64(c1, d1, 0)
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "ADDE", -"ADDC", -"MOVD\t.*, XER"
|
2022-10-28 14:59:43 -06:00
|
|
|
d2, _ = bits.Add64(c2, d2, cx)
|
|
|
|
|
d[0] = d1
|
|
|
|
d[1] = d2
|
|
|
|
}
|
|
|
|
|
2018-10-23 15:38:22 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Sub* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Sub returns the difference and borrow-out of bits.Sub(x, y, ci), with all
// three operands (including the borrow-in) supplied by the caller and both
// results used.
func Sub(x, y, ci uint) (r, co uint) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV","SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2023-05-25 23:33:45 -06:00
|
|
|
// mips64:"SUBV","SGTU"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB","SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// SubC is the constant-operand variant of Sub: it subtracts the literal 7
// from x with borrow-in ci and returns both the difference and the borrow-out.
func SubC(x, ci uint) (r, co uint) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV","SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2023-05-25 23:33:45 -06:00
|
|
|
// mips64:"SUBV","SGTU"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB","SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// SubZ is the zero-borrow-in variant of Sub: it calls bits.Sub with the
// borrow-in fixed to the constant 0 and returns both the difference and the
// borrow-out.
func SubZ(x, y uint) (r, co uint) {
|
|
|
|
// amd64:"SUBQ","SBBQ","NEGQ",-"NEGL"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV","SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", -"SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBC"
|
2023-05-25 23:33:45 -06:00
|
|
|
// mips64:"SUBV","SGTU"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB","SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// SubR is the result-only variant of Sub: it calls bits.Sub(x, y, ci) and
// returns just the difference, discarding the borrow-out.
func SubR(x, y, ci uint) uint {
|
|
|
|
// amd64:"NEGL","SBBQ",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV",-"SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", "SUBE", -"SUBZE", -"NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB",-"SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
r, _ := bits.Sub(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
// SubM performs a three-word subtraction r = p - q by chaining three bits.Sub
// calls through the borrow variable c, which starts at its zero value. The
// borrow produced by the last call is stored in c but never read. The
// asmcheck directives sit immediately before the second call, so they check
// the code generated for the middle word (r[1]), where the borrow is both
// consumed and produced.
func SubM(p, q, r *[3]uint) {
|
|
|
|
var c uint
|
|
|
|
r[0], c = bits.Sub(p[0], q[0], c)
|
|
|
|
// amd64:"SBBQ",-"NEGL",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-"SUBC", "SUBE", -"SUBZE", -"NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
r[1], c = bits.Sub(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Sub(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64 wraps bits.Sub64 with a variable borrow-in (ci) and returns both the
// difference and the borrow-out unchanged. The per-architecture comments in
// the body are asmcheck directives attached to the return statement; each one
// lists opcodes that must appear (or, with a leading '-', must not appear) in
// the code generated for that line.
func Sub64(x, y, ci uint64) (r, co uint64) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV","SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2023-05-25 23:33:45 -06:00
|
|
|
// mips64:"SUBV","SGTU"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB","SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub64(x, y, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64C is the constant-subtrahend variant of Sub64. It calls bits.Sub64
// with the literal 7 as the value subtracted from x and a variable borrow-in
// (ci), and returns both results. The asmcheck directives in the body apply
// to the return statement.
func Sub64C(x, ci uint64) (r, co uint64) {
|
|
|
|
// amd64:"NEGL","SBBQ","NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV","SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", "SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2023-05-25 23:33:45 -06:00
|
|
|
// mips64:"SUBV","SGTU"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB","SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub64(x, 7, ci)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64Z is the zero-borrow-in variant of Sub64. It calls bits.Sub64 with the
// borrow-in fixed at the constant 0 and returns both the difference and the
// borrow-out. Compared with Sub64, several directives here turn a required
// opcode into a forbidden one (leading '-'), for example NEGL on amd64 and
// SBCS on arm64.
func Sub64Z(x, y uint64) (r, co uint64) {
|
|
|
|
// amd64:"SUBQ","SBBQ","NEGQ",-"NEGL"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV","SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", -"SUBE", "SUBZE", "NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBC"
|
2023-05-25 23:33:45 -06:00
|
|
|
// mips64:"SUBV","SGTU"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB","SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
return bits.Sub64(x, y, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64R is the result-only variant of Sub64. It calls bits.Sub64 with a
// variable borrow-in (ci), discards the borrow-out and returns just the
// difference. The directives are attached to the bits.Sub64 assignment line;
// relative to Sub64, the opcodes tied to the borrow-out (for example NEGQ on
// amd64, SGTU on loong64, SLTU on riscv64) are listed as forbidden.
func Sub64R(x, y, ci uint64) uint64 {
|
|
|
|
// amd64:"NEGL","SBBQ",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP"
|
2022-09-06 08:29:31 -06:00
|
|
|
// loong64:"SUBV",-"SGTU"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC", "SUBE", -"SUBZE", -"NEG"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2022-07-29 08:14:53 -06:00
|
|
|
// riscv64: "SUB",-"SLTU"
|
2018-10-23 15:38:22 -06:00
|
|
|
r, _ := bits.Sub64(x, y, ci)
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
// Sub64M performs a three-word subtraction r = p - q by chaining three
// bits.Sub64 calls through the borrow variable c, which starts at its zero
// value. The borrow produced by the last call is stored in c but never read.
// The asmcheck directives sit immediately before the second call, so they
// check the code generated for the middle word (r[1]).
//
// NOTE(review): unlike SubM, this function carries no ppc64x directive;
// confirm whether that omission is intentional.
func Sub64M(p, q, r *[3]uint64) {
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Sub64(p[0], q[0], c)
|
|
|
|
// amd64:"SBBQ",-"NEGL",-"NEGQ"
|
2019-03-20 06:46:20 -06:00
|
|
|
// arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP"
|
2019-04-30 10:46:23 -06:00
|
|
|
// s390x:"SUBE"
|
2018-10-23 15:38:22 -06:00
|
|
|
r[1], c = bits.Sub64(p[1], q[1], c)
|
|
|
|
r[2], c = bits.Sub64(p[2], q[2], c)
|
|
|
|
}
|
|
|
|
|
2021-06-08 12:16:01 -06:00
|
|
|
// Sub64MSaveC performs a two-word subtraction r = p - q and stores the
// borrow-out of each word into the array c instead of a local variable. The
// first bits.Sub64 call uses a constant 0 borrow-in; the second uses the
// stored c[0]. Each call has its own ppc64x directive, whose patterns
// constrain the operands of SUBC as well as the opcodes.
func Sub64MSaveC(p, q, r, c *[2]uint64) {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC\tR\\d+, R\\d+,", "SUBZE", "NEG"
|
2021-06-08 12:16:01 -06:00
|
|
|
r[0], c[0] = bits.Sub64(p[0], q[0], 0)
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SUBC\tR\\d+, [$]0,", "SUBE", "SUBZE", "NEG"
|
2021-06-08 12:16:01 -06:00
|
|
|
r[1], c[1] = bits.Sub64(p[1], q[1], c[0])
|
|
|
|
}
|
|
|
|
|
2020-02-17 04:43:33 -07:00
|
|
|
// Sub64PanicOnOverflowEQ subtracts b from a with a zero borrow-in and panics
// with "overflow" when the borrow-out equals 1; otherwise it returns the
// difference. The parameter b is reused to hold the borrow-out after the
// call. The s390x directive is attached to the if statement and requires a
// BRC matching the given pattern while forbidding ADDE and SUBE.
func Sub64PanicOnOverflowEQ(a, b uint64) uint64 {
|
|
|
|
r, b := bits.Sub64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
|
|
|
|
if b == 1 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64PanicOnOverflowNE subtracts b from a with a zero borrow-in and panics
// with "overflow" when the borrow-out is not 0; otherwise it returns the
// difference. The parameter b is reused to hold the borrow-out after the
// call. The s390x directive is attached to the if statement and requires a
// BRC matching the given pattern while forbidding ADDE and SUBE.
func Sub64PanicOnOverflowNE(a, b uint64) uint64 {
|
|
|
|
r, b := bits.Sub64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
|
|
|
|
if b != 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64PanicOnOverflowGT subtracts b from a with a zero borrow-in and panics
// with "overflow" when the borrow-out is greater than 0; otherwise it returns
// the difference. The parameter b is reused to hold the borrow-out after the
// call. The s390x directive is attached to the if statement and requires a
// BRC matching the given pattern while forbidding ADDE and SUBE.
func Sub64PanicOnOverflowGT(a, b uint64) uint64 {
|
|
|
|
r, b := bits.Sub64(a, b, 0)
|
|
|
|
// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
|
|
|
|
if b > 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64MPanicOnOverflowEQ computes the two-word difference a - b by chaining
// two bits.Sub64 calls through the borrow variable c (initially zero) and
// panics with "overflow" when the final borrow equals 1; otherwise it returns
// the two-word result. The s390x directive is attached to the if statement.
func Sub64MPanicOnOverflowEQ(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Sub64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Sub64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]12,"
|
|
|
|
if c == 1 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64MPanicOnOverflowNE computes the two-word difference a - b by chaining
// two bits.Sub64 calls through the borrow variable c (initially zero) and
// panics with "overflow" when the final borrow is not 0; otherwise it returns
// the two-word result. The s390x directive is attached to the if statement.
func Sub64MPanicOnOverflowNE(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Sub64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Sub64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]12,"
|
|
|
|
if c != 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sub64MPanicOnOverflowGT computes the two-word difference a - b by chaining
// two bits.Sub64 calls through the borrow variable c (initially zero) and
// panics with "overflow" when the final borrow is greater than 0; otherwise
// it returns the two-word result. The s390x directive is attached to the if
// statement.
func Sub64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 {
|
|
|
|
var r [2]uint64
|
|
|
|
var c uint64
|
|
|
|
r[0], c = bits.Sub64(a[0], b[0], c)
|
|
|
|
r[1], c = bits.Sub64(a[1], b[1], c)
|
|
|
|
// s390x:"BRC\t[$]12,"
|
|
|
|
if c > 0 {
|
|
|
|
panic("overflow")
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2018-08-14 16:41:22 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Mul* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Mul wraps bits.Mul and returns both halves (hi, lo) of the product of x and
// y unchanged. The per-architecture comments in the body are asmcheck
// directives attached to the return statement, naming the multiply opcodes
// expected in the generated code.
func Mul(x, y uint) (hi, lo uint) {
|
|
|
|
// amd64:"MULQ"
|
|
|
|
// arm64:"UMULH","MUL"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"MULHDU","MULLD"
|
2019-09-08 16:50:24 -06:00
|
|
|
// s390x:"MLGR"
|
2019-10-13 04:51:49 -06:00
|
|
|
// mips64: "MULVU"
|
2024-05-22 01:55:04 -06:00
|
|
|
// riscv64:"MULHU","MUL"
|
2018-08-14 16:41:22 -06:00
|
|
|
return bits.Mul(x, y)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mul64 wraps bits.Mul64 and returns both halves (hi, lo) of the product of x
// and y unchanged. The per-architecture comments in the body are asmcheck
// directives attached to the return statement, naming the multiply opcodes
// expected in the generated code.
func Mul64(x, y uint64) (hi, lo uint64) {
|
|
|
|
// amd64:"MULQ"
|
|
|
|
// arm64:"UMULH","MUL"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"MULHDU","MULLD"
|
2019-09-08 16:50:24 -06:00
|
|
|
// s390x:"MLGR"
|
2019-10-13 04:51:49 -06:00
|
|
|
// mips64: "MULVU"
|
2021-06-22 05:20:03 -06:00
|
|
|
// riscv64:"MULHU","MUL"
|
2018-08-14 16:41:22 -06:00
|
|
|
return bits.Mul64(x, y)
|
|
|
|
}
|
2018-10-23 20:54:56 -06:00
|
|
|
|
2022-08-22 15:00:17 -06:00
|
|
|
// Mul64HiOnly calls bits.Mul64 and returns only the high half of the product,
// discarding the low half. The directives are attached to the bits.Mul64
// assignment line; they require the high-multiply opcode and forbid the
// low-multiply one on arm64 and riscv64.
func Mul64HiOnly(x, y uint64) uint64 {
|
|
|
|
// arm64:"UMULH",-"MUL"
|
2022-08-24 08:17:51 -06:00
|
|
|
// riscv64:"MULHU",-"MUL\t"
|
2022-08-22 15:00:17 -06:00
|
|
|
hi, _ := bits.Mul64(x, y)
|
|
|
|
return hi
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mul64LoOnly calls bits.Mul64 and returns only the low half of the product,
// discarding the high half. The directives are attached to the bits.Mul64
// assignment line; they require the low-multiply opcode and forbid the
// high-multiply one on arm64 and riscv64.
func Mul64LoOnly(x, y uint64) uint64 {
|
|
|
|
// arm64:"MUL",-"UMULH"
|
2022-08-24 08:17:51 -06:00
|
|
|
// riscv64:"MUL\t",-"MULHU"
|
2022-08-22 15:00:17 -06:00
|
|
|
_, lo := bits.Mul64(x, y)
|
|
|
|
return lo
|
|
|
|
}
|
|
|
|
|
2018-10-23 20:54:56 -06:00
|
|
|
// --------------- //
|
|
|
|
// bits.Div* //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// Div wraps bits.Div, dividing the two-word value (hi, lo) by x and returning
// the quotient and remainder unchanged. The amd64 directive, attached to the
// return statement, requires a DIVQ in the generated code.
func Div(hi, lo, x uint) (q, r uint) {
|
|
|
|
// amd64:"DIVQ"
|
|
|
|
return bits.Div(hi, lo, x)
|
|
|
|
}
|
|
|
|
|
cmd/compile: optimize math/bits.Div32 for arm64
Benchmark:
name old time/op new time/op delta
Div-8 22.0ns ± 0% 22.0ns ± 0% ~ (all equal)
Div32-8 6.51ns ± 0% 3.00ns ± 0% -53.90% (p=0.000 n=10+8)
Div64-8 22.5ns ± 0% 22.5ns ± 0% ~ (all equal)
Code:
func div32(hi, lo, y uint32) (q, r uint32) {return bits.Div32(hi, lo, y)}
Before:
0x0020 00032 (test.go:24) MOVWU "".y+8(FP), R0
0x0024 00036 ($GOROOT/src/math/bits/bits.go:472) CBZW R0, 132
0x0028 00040 ($GOROOT/src/math/bits/bits.go:472) MOVWU "".hi(FP), R1
0x002c 00044 ($GOROOT/src/math/bits/bits.go:472) CMPW R1, R0
0x0030 00048 ($GOROOT/src/math/bits/bits.go:472) BLS 96
0x0034 00052 ($GOROOT/src/math/bits/bits.go:475) MOVWU "".lo+4(FP), R2
0x0038 00056 ($GOROOT/src/math/bits/bits.go:475) ORR R1<<32, R2, R1
0x003c 00060 ($GOROOT/src/math/bits/bits.go:476) CBZ R0, 140
0x0040 00064 ($GOROOT/src/math/bits/bits.go:476) UDIV R0, R1, R2
0x0044 00068 (test.go:24) MOVW R2, "".q+16(FP)
0x0048 00072 ($GOROOT/src/math/bits/bits.go:476) UREM R0, R1, R0
0x0050 00080 (test.go:24) MOVW R0, "".r+20(FP)
0x0054 00084 (test.go:24) MOVD -8(RSP), R29
0x0058 00088 (test.go:24) MOVD.P 32(RSP), R30
0x005c 00092 (test.go:24) RET (R30)
After:
0x001c 00028 (test.go:24) MOVWU "".y+8(FP), R0
0x0020 00032 (test.go:24) CBZW R0, 92
0x0024 00036 (test.go:24) MOVWU "".hi(FP), R1
0x0028 00040 (test.go:24) CMPW R0, R1
0x002c 00044 (test.go:24) BHS 84
0x0030 00048 (test.go:24) MOVWU "".lo+4(FP), R2
0x0034 00052 (test.go:24) ORR R1<<32, R2, R4
0x0038 00056 (test.go:24) UDIV R0, R4, R3
0x003c 00060 (test.go:24) MSUB R3, R4, R0, R4
0x0040 00064 (test.go:24) MOVW R3, "".q+16(FP)
0x0044 00068 (test.go:24) MOVW R4, "".r+20(FP)
0x0048 00072 (test.go:24) MOVD -8(RSP), R29
0x004c 00076 (test.go:24) MOVD.P 16(RSP), R30
0x0050 00080 (test.go:24) RET (R30)
UREM instruction in the previous assembly code will be converted to UDIV and MSUB instructions
on arm64. However the UDIV instruction in UREM is unnecessary, because it's a duplicate of the
previous UDIV. This CL adds a rule to have this extra UDIV instruction removed by CSE.
Change-Id: Ie2508784320020b2de022806d09f75a7871bb3d7
Reviewed-on: https://go-review.googlesource.com/c/159577
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Bryan C. Mills <bcmills@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-01-22 02:10:59 -07:00
|
|
|
// Div32 wraps bits.Div32, dividing the two-word value (hi, lo) by x and
// returning the quotient and remainder unchanged. The arm64 directive,
// attached to the return statement, requires ORR, UDIV and MSUB and forbids
// UREM in the generated code.
func Div32(hi, lo, x uint32) (q, r uint32) {
|
|
|
|
// arm64:"ORR","UDIV","MSUB",-"UREM"
|
|
|
|
return bits.Div32(hi, lo, x)
|
|
|
|
}
|
|
|
|
|
2018-10-23 20:54:56 -06:00
|
|
|
// Div64 wraps bits.Div64, dividing the two-word value (hi, lo) by x and
// returning the quotient and remainder unchanged. The amd64 directive,
// attached to the return statement, requires a DIVQ in the generated code.
func Div64(hi, lo, x uint64) (q, r uint64) {
|
|
|
|
// amd64:"DIVQ"
|
|
|
|
return bits.Div64(hi, lo, x)
|
|
|
|
}
|
2019-04-20 12:09:34 -06:00
|
|
|
|
|
|
|
// Div64degenerate calls bits.Div64 with the high word fixed at the constant 0
// and the divisor fixed at the constant 5, returning the quotient and
// remainder. The amd64 directive, attached to the return statement, forbids
// DIVQ in the generated code for this constant-operand case.
func Div64degenerate(x uint64) (q, r uint64) {
|
|
|
|
// amd64:-"DIVQ"
|
|
|
|
return bits.Div64(0, x, 5)
|
|
|
|
}
|