2018-04-30 06:27:50 -06:00
|
|
|
// asmcheck
|
|
|
|
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package codegen
|
|
|
|
|
2021-09-21 10:46:00 -06:00
|
|
|
// ------------------ //
|
|
|
|
// constant shifts //
|
|
|
|
// ------------------ //
|
|
|
|
|
|
|
|
func lshConst64x64(v int64) int64 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SLD"
|
2022-09-01 04:36:34 -06:00
|
|
|
// riscv64:"SLLI",-"AND",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v << uint64(33)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64Ux64(v uint64) uint64 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SRD"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRLI\t",-"AND",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> uint64(33)
|
|
|
|
}
|
|
|
|
|
2023-11-25 08:48:05 -07:00
|
|
|
func rshConst64Ux64Overflow32(v uint32) uint64 {
|
|
|
|
// riscv64:"MOV\t\\$0,",-"SRL"
|
|
|
|
return uint64(v) >> 32
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64Ux64Overflow16(v uint16) uint64 {
|
|
|
|
// riscv64:"MOV\t\\$0,",-"SRL"
|
|
|
|
return uint64(v) >> 16
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64Ux64Overflow8(v uint8) uint64 {
|
|
|
|
// riscv64:"MOV\t\\$0,",-"SRL"
|
|
|
|
return uint64(v) >> 8
|
|
|
|
}
|
|
|
|
|
2021-09-21 10:46:00 -06:00
|
|
|
func rshConst64x64(v int64) int64 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SRAD"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRAI\t",-"OR",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> uint64(33)
|
|
|
|
}
|
|
|
|
|
2023-11-25 08:48:05 -07:00
|
|
|
func rshConst64x64Overflow32(v int32) int64 {
|
|
|
|
// riscv64:"SRAIW",-"SLLI",-"SRAI\t"
|
|
|
|
return int64(v) >> 32
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64x64Overflow16(v int16) int64 {
|
|
|
|
// riscv64:"SLLI","SRAI",-"SRAIW"
|
|
|
|
return int64(v) >> 16
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64x64Overflow8(v int8) int64 {
|
|
|
|
// riscv64:"SLLI","SRAI",-"SRAIW"
|
|
|
|
return int64(v) >> 8
|
|
|
|
}
|
|
|
|
|
2021-09-21 10:46:00 -06:00
|
|
|
func lshConst32x64(v int32) int32 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SLW"
|
cmd/compile: fold constant shift with extension on riscv64
For example:
movb a0, a0
srai $1, a0, a0
the assembler will expand to:
slli $56, a0, a0
srai $56, a0, a0
srai $1, a0, a0
this CL optimize to:
slli $56, a0, a0
srai $57, a0, a0
Remove 270+ instructions from Go binary on linux/riscv64.
Change-Id: I375e19f9d3bd54f2781791d8cbe5970191297dc8
Reviewed-on: https://go-review.googlesource.com/c/go/+/428496
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Wayne Zuo <wdvxdr@golangcn.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-05 21:43:28 -06:00
|
|
|
// riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v << uint64(29)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst32Ux64(v uint32) uint32 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SRW"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> uint64(29)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst32x64(v int32) int32 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SRAW"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> uint64(29)
|
|
|
|
}
|
|
|
|
|
|
|
|
func lshConst64x32(v int64) int64 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SLD"
|
2022-09-01 04:36:34 -06:00
|
|
|
// riscv64:"SLLI",-"AND",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v << uint32(33)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64Ux32(v uint64) uint64 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SRD"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRLI\t",-"AND",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> uint32(33)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshConst64x32(v int64) int64 {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"SRAD"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRAI\t",-"OR",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> uint32(33)
|
|
|
|
}
|
|
|
|
|
2018-04-30 06:27:50 -06:00
|
|
|
// ------------------ //
|
|
|
|
// masked shifts //
|
|
|
|
// ------------------ //
|
|
|
|
|
|
|
|
func lshMask64x64(v int64, s uint64) int64 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"LSL",-"AND"
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
2022-08-10 10:07:40 -06:00
|
|
|
// riscv64:"SLL",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v << (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask64Ux64(v uint64, s uint64) uint64 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"LSR",-"AND",-"CSEL"
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRL\t",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v >> (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask64x64(v int64, s uint64) int64 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"ASR",-"AND",-"CSEL"
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRA\t",-"OR",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v >> (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func lshMask32x64(v int32, s uint64) int32 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"LSL",-"AND"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ISEL",-"ORN"
|
2022-09-01 04:36:34 -06:00
|
|
|
// riscv64:"SLL",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v << (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask32Ux64(v uint32, s uint64) uint32 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"LSR",-"AND"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ISEL",-"ORN"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRLW","SLTIU","NEG","AND\t",-"SRL\t"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v >> (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
func rsh5Mask32Ux64(v uint32, s uint64) uint32 {
|
|
|
|
// riscv64:"SRLW",-"AND\t",-"SLTIU",-"SRL\t"
|
|
|
|
return v >> (s & 31)
|
|
|
|
}
|
|
|
|
|
2018-04-30 06:27:50 -06:00
|
|
|
func rshMask32x64(v int32, s uint64) int32 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"ASR",-"AND"
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"ISEL",-"ORN"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRAW","OR","SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v >> (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
func rsh5Mask32x64(v int32, s uint64) int32 {
|
|
|
|
// riscv64:"SRAW",-"OR",-"SLTIU"
|
|
|
|
return v >> (s & 31)
|
|
|
|
}
|
|
|
|
|
2018-04-30 06:27:50 -06:00
|
|
|
func lshMask64x32(v int64, s uint32) int64 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"LSL",-"AND"
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN"
|
2022-08-10 10:07:40 -06:00
|
|
|
// riscv64:"SLL",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v << (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask64Ux32(v uint64, s uint32) uint64 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"LSR",-"AND",-"CSEL"
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRL\t",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v >> (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask64x32(v int64, s uint32) int64 {
|
2022-09-01 04:36:34 -06:00
|
|
|
// arm64:"ASR",-"AND",-"CSEL"
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRA\t",-"OR",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2019-03-13 14:52:17 -06:00
|
|
|
return v >> (s & 63)
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func lshMask64x32Ext(v int64, s int32) int64 {
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
2022-09-01 04:36:34 -06:00
|
|
|
// riscv64:"SLL",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2018-04-30 06:27:50 -06:00
|
|
|
return v << uint(s&63)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask64Ux32Ext(v uint64, s int32) uint64 {
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRL\t",-"AND\t",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2018-04-30 06:27:50 -06:00
|
|
|
return v >> uint(s&63)
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshMask64x32Ext(v int64, s int32) int64 {
|
2023-09-27 11:15:04 -06:00
|
|
|
// ppc64x:"RLDICL",-"ORN",-"ISEL"
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRA\t",-"OR",-"SLTIU"
|
2021-09-21 10:46:00 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
2018-04-30 06:27:50 -06:00
|
|
|
return v >> uint(s&63)
|
|
|
|
}
|
|
|
|
|
2019-03-13 14:53:38 -06:00
|
|
|
// --------------- //
|
|
|
|
// signed shifts //
|
|
|
|
// --------------- //
|
|
|
|
|
|
|
|
// We do want to generate a test + panicshift for these cases.
|
|
|
|
func lshSigned(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
|
|
|
|
// amd64:"TESTB"
|
|
|
|
_ = x << v8
|
|
|
|
// amd64:"TESTW"
|
|
|
|
_ = x << v16
|
|
|
|
// amd64:"TESTL"
|
|
|
|
_ = x << v32
|
|
|
|
// amd64:"TESTQ"
|
|
|
|
_ = x << v64
|
|
|
|
}
|
|
|
|
|
|
|
|
// We want to avoid generating a test + panicshift for these cases.
|
|
|
|
func lshSignedMasked(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
|
|
|
|
// amd64:-"TESTB"
|
|
|
|
_ = x << (v8 & 7)
|
|
|
|
// amd64:-"TESTW"
|
|
|
|
_ = x << (v16 & 15)
|
|
|
|
// amd64:-"TESTL"
|
|
|
|
_ = x << (v32 & 31)
|
|
|
|
// amd64:-"TESTQ"
|
|
|
|
_ = x << (v64 & 63)
|
|
|
|
}
|
|
|
|
|
2018-04-30 06:27:50 -06:00
|
|
|
// ------------------ //
|
|
|
|
// bounded shifts //
|
|
|
|
// ------------------ //
|
|
|
|
|
2021-09-21 10:46:00 -06:00
|
|
|
func lshGuarded64(v int64, s uint) int64 {
|
2018-04-30 06:27:50 -06:00
|
|
|
if s < 64 {
|
2022-08-10 10:07:40 -06:00
|
|
|
// riscv64:"SLL",-"AND",-"SLTIU"
|
2020-05-11 10:44:48 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
|
|
|
// wasm:-"Select",-".*LtU"
|
2022-07-01 00:25:12 -06:00
|
|
|
// arm64:"LSL",-"CSEL"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v << s
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
panic("shift too large")
|
|
|
|
}
|
|
|
|
|
|
|
|
func rshGuarded64U(v uint64, s uint) uint64 {
|
|
|
|
if s < 64 {
|
cmd/compile: optimize right shifts of uint32 on riscv
The compiler is currently zero extending 32 bit unsigned integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit unsigned values (srlw and srliw) which zero extend
the result of the shift to 64 bits. Change the compiler so that
it uses srlw and srliw for 32 bit unsigned shifts reducing in most
cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
uint32(a) >> 2
before:
sll x5,x10,0x20
srl x10,x5,0x22
after:
srlw x10,x10,0x2
uint32(a) >> int(b)
before:
sll x5,x10,0x20
srl x5,x5,0x20
srl x5,x5,x11
sltiu x6,x11,64
neg x6,x6
and x10,x5,x6
after:
srlw x5,x10,x11
sltiu x6,x11,32
neg x6,x6
and x10,x5,x6
bits.RotateLeft32(uint32(a), 1)
before:
sll x5,x10,0x1
sll x6,x10,0x20
srl x7,x6,0x3f
or x5,x5,x7
after:
sll x5,x10,0x1
srlw x6,x10,0x1f
or x10,x5,x6
bits.RotateLeft32(uint32(a), int(b))
before:
and x6,x11,31
sll x7,x10,x6
sll x8,x10,0x20
srl x8,x8,0x20
add x6,x6,-32
neg x6,x6
srl x9,x8,x6
sltiu x6,x6,64
neg x6,x6
and x6,x9,x6
or x6,x6,x7
after:
and x5,x11,31
sll x6,x10,x5
add x5,x5,-32
neg x5,x5
srlw x7,x10,x5
sltiu x5,x5,32
neg x5,x5
and x5,x7,x5
or x10,x6,x5
The one regression observed is the following case, an unbounded right
shift of a uint32 where the value we're shifting by is known to be
< 64 but > 31. As this is an unusual case this commit does not
optimize for it, although the existing code does.
uint32(a) >> (b & 63)
before:
sll x5,x10,0x20
srl x5,x5,0x20
and x6,x11,63
srl x10,x5,x6
after
and x5,x11,63
srlw x6,x10,x5
sltiu x5,x5,32
neg x5,x5
and x10,x6,x5
Here we have one extra instruction.
Some benchmark highlights, generated on a VisionFive2 8GB running
Ubuntu 23.04.
pkg: math/bits
LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10)
LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10)
TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10)
TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10)
TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10)
OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10)
RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10)
RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10)
Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10)
ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10)
Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10)
geomean 11.50n 11.33n -1.45%
pkg: crypto/md5
Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10)
Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10)
Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10)
Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10)
Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10)
Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10)
Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10)
Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10)
Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10)
Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10)
Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10)
Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10)
geomean 28.32µ 26.42µ -6.72%
Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693
Reviewed-on: https://go-review.googlesource.com/c/go/+/528975
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 05:08:55 -06:00
|
|
|
// riscv64:"SRL\t",-"AND",-"SLTIU"
|
2020-05-11 10:44:48 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
|
|
|
// wasm:-"Select",-".*LtU"
|
2022-07-01 00:25:12 -06:00
|
|
|
// arm64:"LSR",-"CSEL"
|
2018-04-30 06:27:50 -06:00
|
|
|
return v >> s
|
|
|
|
}
|
|
|
|
panic("shift too large")
|
|
|
|
}
|
|
|
|
|
2021-09-21 10:46:00 -06:00
|
|
|
func rshGuarded64(v int64, s uint) int64 {
|
2018-04-30 06:27:50 -06:00
|
|
|
if s < 64 {
|
cmd/compile: optimize right shifts of int32 on riscv64
The compiler is currently sign extending 32 bit signed integers to
64 bits before right shifting them using a 64 bit shift instruction.
There's no need to do this as RISC-V has instructions for right
shifting 32 bit signed values (sraw and sraiw) which sign extend
the result of the shift to 64 bits. Change the compiler so that
it uses sraw and sraiw for shifts of signed 32 bit integers reducing
in most cases the number of instructions needed to perform the shift.
Here are some examples of code sequences that are changed by this
patch:
int32(a) >> 2
before:
sll x5,x10,0x20
sra x10,x5,0x22
after:
sraw x10,x10,0x2
int32(v) >> int(s)
before:
sext.w x5,x10
sltiu x6,x11,64
add x6,x6,-1
or x6,x11,x6
sra x10,x5,x6
after:
sltiu x5,x11,32
add x5,x5,-1
or x5,x11,x5
sraw x10,x10,x5
int32(v) >> (int(s) & 31)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,31
sraw x10,x10,x5
int32(100) >> int(a)
before:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,64
add x5,x5,-1
or x5,x10,x5
li x6,100
sra x10,x6,x5
after:
bltz x10,<target address calls runtime.panicshift>
sltiu x5,x10,32
add x5,x5,-1
or x5,x10,x5
li x6,100
sraw x10,x6,x5
int32(v) >> (int(s) & 63)
before:
sext.w x5,x10
and x6,x11,63
sra x10,x5,x6
after:
and x5,x11,63
sltiu x6,x5,32
add x6,x6,-1
or x5,x5,x6
sraw x10,x10,x5
In most cases we eliminate one instruction. In the case where
we shift a int32 constant by a variable the number of instructions
generated is identical. A sra is simply replaced by a sraw. In the
unusual case where we shift right by a variable anded with a constant
> 31 but < 64, we generate two additional instructions. As this is
an unusual case we do not try to optimize for it.
Some improvements can be seen in some of the existing benchmarks,
notably in the utf8 package which performs right shifts of runes
which are signed 32 bit integers.
| utf8-old | utf8-new |
| sec/op | sec/op vs base |
EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10)
EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10)
AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10)
AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10)
DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10)
DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10)
Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2
Reviewed-on: https://go-review.googlesource.com/c/go/+/535115
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: David Chase <drchase@google.com>
2023-09-22 07:14:25 -06:00
|
|
|
// riscv64:"SRA\t",-"OR",-"SLTIU"
|
2020-05-11 10:44:48 -06:00
|
|
|
// s390x:-"RISBGZ",-"AND",-"LOCGR"
|
|
|
|
// wasm:-"Select",-".*LtU"
|
2022-07-01 00:25:12 -06:00
|
|
|
// arm64:"ASR",-"CSEL"
|
2021-09-21 10:46:00 -06:00
|
|
|
return v >> s
|
2018-04-30 06:27:50 -06:00
|
|
|
}
|
|
|
|
panic("shift too large")
|
|
|
|
}
|
2020-03-26 14:01:40 -06:00
|
|
|
|
2022-07-01 00:25:12 -06:00
|
|
|
func provedUnsignedShiftLeft(val64 uint64, val32 uint32, val16 uint16, val8 uint8, shift int) (r1 uint64, r2 uint32, r3 uint16, r4 uint8) {
|
|
|
|
if shift >= 0 && shift < 64 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r1 = val64 << shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 32 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r2 = val32 << shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 16 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r3 = val16 << shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 8 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r4 = val8 << shift
|
|
|
|
}
|
|
|
|
return r1, r2, r3, r4
|
|
|
|
}
|
|
|
|
|
|
|
|
func provedSignedShiftLeft(val64 int64, val32 int32, val16 int16, val8 int8, shift int) (r1 int64, r2 int32, r3 int16, r4 int8) {
|
|
|
|
if shift >= 0 && shift < 64 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r1 = val64 << shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 32 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r2 = val32 << shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 16 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r3 = val16 << shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 8 {
|
|
|
|
// arm64:"LSL",-"CSEL"
|
|
|
|
r4 = val8 << shift
|
|
|
|
}
|
|
|
|
return r1, r2, r3, r4
|
|
|
|
}
|
|
|
|
|
|
|
|
func provedUnsignedShiftRight(val64 uint64, val32 uint32, val16 uint16, val8 uint8, shift int) (r1 uint64, r2 uint32, r3 uint16, r4 uint8) {
|
|
|
|
if shift >= 0 && shift < 64 {
|
|
|
|
// arm64:"LSR",-"CSEL"
|
|
|
|
r1 = val64 >> shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 32 {
|
|
|
|
// arm64:"LSR",-"CSEL"
|
|
|
|
r2 = val32 >> shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 16 {
|
|
|
|
// arm64:"LSR",-"CSEL"
|
|
|
|
r3 = val16 >> shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 8 {
|
|
|
|
// arm64:"LSR",-"CSEL"
|
|
|
|
r4 = val8 >> shift
|
|
|
|
}
|
|
|
|
return r1, r2, r3, r4
|
|
|
|
}
|
|
|
|
|
|
|
|
func provedSignedShiftRight(val64 int64, val32 int32, val16 int16, val8 int8, shift int) (r1 int64, r2 int32, r3 int16, r4 int8) {
|
|
|
|
if shift >= 0 && shift < 64 {
|
|
|
|
// arm64:"ASR",-"CSEL"
|
|
|
|
r1 = val64 >> shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 32 {
|
|
|
|
// arm64:"ASR",-"CSEL"
|
|
|
|
r2 = val32 >> shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 16 {
|
|
|
|
// arm64:"ASR",-"CSEL"
|
|
|
|
r3 = val16 >> shift
|
|
|
|
}
|
|
|
|
if shift >= 0 && shift < 8 {
|
|
|
|
// arm64:"ASR",-"CSEL"
|
|
|
|
r4 = val8 >> shift
|
|
|
|
}
|
|
|
|
return r1, r2, r3, r4
|
|
|
|
}
|
|
|
|
|
2020-08-31 07:43:40 -06:00
|
|
|
func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byte) (uint32, uint64) {
|
|
|
|
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f := tab[byte(v)^b]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f += tab[byte(v)&b]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f += tab[byte(v)|b]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f += tab[uint16(v)&h]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f += tab[uint16(v)^h]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f += tab[uint16(v)|h]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*AND",-"RLDICR",".*CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
f += tab[v&0xff]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*AND",".*CLRLSLWI"
|
2020-10-23 11:12:34 -06:00
|
|
|
f += 2 * uint32(uint16(d))
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-".*AND",-"RLDICR",".*CLRLSLDI"
|
2020-10-23 11:12:34 -06:00
|
|
|
g := 2 * uint64(uint32(d))
|
2020-08-31 07:43:40 -06:00
|
|
|
return f, g
|
|
|
|
}
|
|
|
|
|
2020-09-23 09:06:39 -06:00
|
|
|
func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64) (uint8, uint16, uint32, uint64, int64) {
|
2020-08-31 07:43:40 -06:00
|
|
|
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-"AND","CLRLSLWI"
|
2020-10-23 11:12:34 -06:00
|
|
|
f := (v8 & 0xF) << 2
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CLRLSLWI"
|
2020-10-23 11:12:34 -06:00
|
|
|
f += byte(v16) << 3
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-"AND","CLRLSLWI"
|
2020-08-31 07:43:40 -06:00
|
|
|
g := (v16 & 0xFF) << 3
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-"AND","CLRLSLWI"
|
2020-08-31 07:43:40 -06:00
|
|
|
h := (v32 & 0xFFFFF) << 2
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:"CLRLSLDI"
|
2020-08-31 07:43:40 -06:00
|
|
|
i := (v64 & 0xFFFFFFFF) << 5
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x:-"CLRLSLDI"
|
2020-09-28 16:20:12 -06:00
|
|
|
i += (v64 & 0xFFFFFFF) << 38
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x/power9:-"CLRLSLDI"
|
2020-09-28 16:20:12 -06:00
|
|
|
i += (v64 & 0xFFFF00) << 10
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x/power9:-"SLD","EXTSWSLI"
|
2020-10-23 11:12:34 -06:00
|
|
|
j := int64(x32+32) * 8
|
2020-09-23 09:06:39 -06:00
|
|
|
return f, g, h, i, j
|
2020-08-31 07:43:40 -06:00
|
|
|
}
|
|
|
|
|
2020-03-26 14:01:40 -06:00
|
|
|
func checkWidenAfterShift(v int64, u uint64) (int64, uint64) {
|
|
|
|
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:-".*MOVW"
|
2020-10-23 11:12:34 -06:00
|
|
|
f := int32(v >> 32)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:".*MOVW"
|
2020-10-23 11:12:34 -06:00
|
|
|
f += int32(v >> 31)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:-".*MOVH"
|
2020-10-23 11:12:34 -06:00
|
|
|
g := int16(v >> 48)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:".*MOVH"
|
2020-10-23 11:12:34 -06:00
|
|
|
g += int16(v >> 30)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:-".*MOVH"
|
2020-10-23 11:12:34 -06:00
|
|
|
g += int16(f >> 16)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:-".*MOVB"
|
2020-10-23 11:12:34 -06:00
|
|
|
h := int8(v >> 56)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:".*MOVB"
|
2020-10-23 11:12:34 -06:00
|
|
|
h += int8(v >> 28)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:-".*MOVB"
|
2020-10-23 11:12:34 -06:00
|
|
|
h += int8(f >> 24)
|
2023-01-24 10:38:29 -07:00
|
|
|
// ppc64x:".*MOVB"
|
2020-10-23 11:12:34 -06:00
|
|
|
h += int8(f >> 16)
|
|
|
|
return int64(h), uint64(g)
|
|
|
|
}
|
|
|
|
|
|
|
|
func checkShiftAndMask32(v []uint32) {
|
|
|
|
i := 0
|
|
|
|
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "RLWNM\t[$]24, R[0-9]+, [$]12, [$]19, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] & 0xFF00000) >> 8
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "RLWNM\t[$]26, R[0-9]+, [$]22, [$]29, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] & 0xFF00) >> 6
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "MOVW\tR0"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] & 0xFF) >> 8
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "MOVW\tR0"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] & 0xF000000) >> 28
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "RLWNM\t[$]26, R[0-9]+, [$]24, [$]31, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] >> 6) & 0xFF
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "RLWNM\t[$]26, R[0-9]+, [$]12, [$]19, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] >> 6) & 0xFF000
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "MOVW\tR0"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] >> 20) & 0xFF000
|
|
|
|
i++
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: "MOVW\tR0"
|
2020-10-23 11:12:34 -06:00
|
|
|
v[i] = (v[i] >> 24) & 0xFF00
|
|
|
|
i++
|
|
|
|
}
|
|
|
|
|
|
|
|
func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) {
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
a[0] = a[uint8(v>>24)]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]21, [$]28, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
b[0] = b[uint8(v>>24)]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
b[1] = b[(v>>20)&0xFF]
|
2023-01-25 10:53:10 -07:00
|
|
|
// ppc64x: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
|
2020-10-23 11:12:34 -06:00
|
|
|
b[2] = b[v>>25]
|
2020-03-26 14:01:40 -06:00
|
|
|
}
|
2021-01-07 20:25:05 -07:00
|
|
|
|
2024-05-20 13:44:21 -06:00
|
|
|
func checkMergedShifts64(a [256]uint32, b [256]uint64, c [256]byte, v uint64) {
|
2024-04-26 08:26:52 -06:00
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
|
|
|
|
a[0] = a[uint8(v>>24)]
|
|
|
|
// ppc64x: "SRD", "CLRLSLDI", -"RLWNM"
|
|
|
|
a[1] = a[uint8(v>>25)]
|
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]9, R[0-9]+, [$]23, [$]29, R[0-9]+"
|
|
|
|
a[2] = a[v>>25&0x7F]
|
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]3, R[0-9]+, [$]29, [$]29, R[0-9]+"
|
|
|
|
a[3] = a[(v>>31)&0x01]
|
|
|
|
// ppc64x: "SRD", "CLRLSLDI", -"RLWNM"
|
|
|
|
a[4] = a[(v>>30)&0x07]
|
|
|
|
// ppc64x: "SRD", "CLRLSLDI", -"RLWNM"
|
|
|
|
a[5] = a[(v>>32)&0x01]
|
|
|
|
// ppc64x: "SRD", "CLRLSLDI", -"RLWNM"
|
2024-05-01 14:03:34 -06:00
|
|
|
a[6] = a[(v>>34)&0x03]
|
2024-04-26 08:26:52 -06:00
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]12, R[0-9]+, [$]21, [$]28, R[0-9]+"
|
|
|
|
b[0] = b[uint8(v>>23)]
|
|
|
|
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
|
|
|
|
b[1] = b[(v>>20)&0xFF]
|
2024-05-01 14:03:34 -06:00
|
|
|
// ppc64x: "RLWNM", -"SLD"
|
|
|
|
b[2] = b[((uint64((uint32(v) >> 21)) & 0x3f) << 4)]
|
2024-05-20 13:44:21 -06:00
|
|
|
// ppc64x: "RLWNM\t[$]11, R[0-9]+, [$]10, [$]15"
|
|
|
|
c[0] = c[((v>>5)&0x3F)<<16]
|
|
|
|
// ppc64x: "RLWNM\t[$]0, R[0-9]+, [$]19, [$]24"
|
|
|
|
c[1] = c[((v>>7)&0x3F)<<7]
|
2024-05-01 14:03:34 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
func checkShiftMask(a uint32, b uint64, z []uint32, y []uint64) {
|
|
|
|
_ = y[128]
|
|
|
|
_ = z[128]
|
|
|
|
// ppc64x: -"MOVBZ", -"SRW", "RLWNM"
|
|
|
|
z[0] = uint32(uint8(a >> 5))
|
|
|
|
// ppc64x: -"MOVBZ", -"SRW", "RLWNM"
|
|
|
|
z[1] = uint32(uint8((a >> 4) & 0x7e))
|
|
|
|
// ppc64x: "RLWNM\t[$]25, R[0-9]+, [$]27, [$]29, R[0-9]+"
|
|
|
|
z[2] = uint32(uint8(a>>7)) & 0x1c
|
|
|
|
// ppc64x: -"MOVWZ"
|
|
|
|
y[0] = uint64((a >> 6) & 0x1c)
|
|
|
|
// ppc64x: -"MOVWZ"
|
|
|
|
y[1] = uint64(uint32(b)<<6) + 1
|
|
|
|
// ppc64x: -"MOVHZ", -"MOVWZ"
|
|
|
|
y[2] = uint64((uint16(a) >> 9) & 0x1F)
|
|
|
|
// ppc64x: -"MOVHZ", -"MOVWZ", -"ANDCC"
|
|
|
|
y[3] = uint64(((uint16(a) & 0xFF0) >> 9) & 0x1F)
|
2024-04-26 08:26:52 -06:00
|
|
|
}
|
|
|
|
|
2021-01-07 20:25:05 -07:00
|
|
|
// 128 bit shifts
|
|
|
|
|
|
|
|
func check128bitShifts(x, y uint64, bits uint) (uint64, uint64) {
|
|
|
|
s := bits & 63
|
|
|
|
ŝ := (64 - bits) & 63
|
|
|
|
// check that the shift operation has two commas (three operands)
|
|
|
|
// amd64:"SHRQ.*,.*,"
|
|
|
|
shr := x>>s | y<<ŝ
|
|
|
|
// amd64:"SHLQ.*,.*,"
|
|
|
|
shl := x<<s | y>>ŝ
|
|
|
|
return shr, shl
|
|
|
|
}
|
2022-08-17 14:09:12 -06:00
|
|
|
|
|
|
|
func checkShiftToMask(u []uint64, s []int64) {
|
|
|
|
// amd64:-"SHR",-"SHL","ANDQ"
|
|
|
|
u[0] = u[0] >> 5 << 5
|
|
|
|
// amd64:-"SAR",-"SHL","ANDQ"
|
|
|
|
s[0] = s[0] >> 5 << 5
|
|
|
|
// amd64:-"SHR",-"SHL","ANDQ"
|
|
|
|
u[1] = u[1] << 5 >> 5
|
|
|
|
}
|