mirror of
https://github.com/golang/go
synced 2024-11-07 16:36:24 -07:00
43d5f213e2
amd64 can shift in bits from another register instead of filling with 0/1.
This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts.
In the standard library, it shows up in pure Go math/big.

Benchmark results on amd64 with -tags=math_big_pure_go.

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.45ns ± 3%  4.39ns ± 1%   -1.28%  (p=0.000 n=30+27)
NonZeroShifts/1/shlVU-8       4.13ns ± 4%  4.10ns ± 2%     ~     (p=0.254 n=29+28)
NonZeroShifts/2/shrVU-8       5.55ns ± 1%  5.63ns ± 2%   +1.42%  (p=0.000 n=28+29)
NonZeroShifts/2/shlVU-8       5.70ns ± 2%  5.14ns ± 1%   -9.82%  (p=0.000 n=29+28)
NonZeroShifts/3/shrVU-8       6.79ns ± 2%  6.35ns ± 2%   -6.46%  (p=0.000 n=28+29)
NonZeroShifts/3/shlVU-8       6.69ns ± 1%  6.25ns ± 1%   -6.60%  (p=0.000 n=28+27)
NonZeroShifts/4/shrVU-8       7.79ns ± 2%  7.06ns ± 2%   -9.48%  (p=0.000 n=30+30)
NonZeroShifts/4/shlVU-8       7.82ns ± 1%  7.24ns ± 1%   -7.37%  (p=0.000 n=28+29)
NonZeroShifts/5/shrVU-8       8.90ns ± 3%  7.93ns ± 1%  -10.84%  (p=0.000 n=29+26)
NonZeroShifts/5/shlVU-8       8.68ns ± 1%  7.92ns ± 1%   -8.76%  (p=0.000 n=29+29)
NonZeroShifts/10/shrVU-8      14.4ns ± 1%  12.3ns ± 2%  -14.79%  (p=0.000 n=28+29)
NonZeroShifts/10/shlVU-8      14.1ns ± 1%  11.9ns ± 2%  -15.55%  (p=0.000 n=28+27)
NonZeroShifts/100/shrVU-8      118ns ± 1%    96ns ± 3%  -18.82%  (p=0.000 n=30+29)
NonZeroShifts/100/shlVU-8      120ns ± 2%    98ns ± 2%  -18.46%  (p=0.000 n=29+28)
NonZeroShifts/1000/shrVU-8    1.10µs ± 1%  0.88µs ± 2%  -19.63%  (p=0.000 n=29+30)
NonZeroShifts/1000/shlVU-8    1.10µs ± 2%  0.88µs ± 2%  -20.28%  (p=0.000 n=29+28)
NonZeroShifts/10000/shrVU-8   10.9µs ± 1%   8.7µs ± 1%  -19.78%  (p=0.000 n=28+27)
NonZeroShifts/10000/shlVU-8   10.9µs ± 2%   8.7µs ± 1%  -19.64%  (p=0.000 n=29+27)
NonZeroShifts/100000/shrVU-8   111µs ± 2%    90µs ± 2%  -19.39%  (p=0.000 n=28+29)
NonZeroShifts/100000/shlVU-8   113µs ± 2%    90µs ± 2%  -20.43%  (p=0.000 n=30+27)

The assembly version is still faster, unfortunately, but the gap is narrowing.

Speedup from pure Go to assembly:

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.39ns ± 1%  3.45ns ± 2%  -21.36%  (p=0.000 n=27+29)
NonZeroShifts/1/shlVU-8       4.10ns ± 2%  3.47ns ± 3%  -15.42%  (p=0.000 n=28+30)
NonZeroShifts/2/shrVU-8       5.63ns ± 2%  3.97ns ± 0%  -29.40%  (p=0.000 n=29+25)
NonZeroShifts/2/shlVU-8       5.14ns ± 1%  3.77ns ± 2%  -26.65%  (p=0.000 n=28+26)
NonZeroShifts/3/shrVU-8       6.35ns ± 2%  4.79ns ± 2%  -24.52%  (p=0.000 n=29+29)
NonZeroShifts/3/shlVU-8       6.25ns ± 1%  4.42ns ± 1%  -29.29%  (p=0.000 n=27+26)
NonZeroShifts/4/shrVU-8       7.06ns ± 2%  5.64ns ± 1%  -20.05%  (p=0.000 n=30+29)
NonZeroShifts/4/shlVU-8       7.24ns ± 1%  5.34ns ± 2%  -26.23%  (p=0.000 n=29+29)
NonZeroShifts/5/shrVU-8       7.93ns ± 1%  6.56ns ± 2%  -17.26%  (p=0.000 n=26+30)
NonZeroShifts/5/shlVU-8       7.92ns ± 1%  6.27ns ± 1%  -20.79%  (p=0.000 n=29+25)
NonZeroShifts/10/shrVU-8      12.3ns ± 2%  10.2ns ± 2%  -17.21%  (p=0.000 n=29+29)
NonZeroShifts/10/shlVU-8      11.9ns ± 2%  10.5ns ± 2%  -12.45%  (p=0.000 n=27+29)
NonZeroShifts/100/shrVU-8     95.9ns ± 3%  77.7ns ± 1%  -19.00%  (p=0.000 n=29+30)
NonZeroShifts/100/shlVU-8     97.5ns ± 2%  66.8ns ± 2%  -31.47%  (p=0.000 n=28+30)
NonZeroShifts/1000/shrVU-8     884ns ± 2%   705ns ± 1%  -20.17%  (p=0.000 n=30+28)
NonZeroShifts/1000/shlVU-8     880ns ± 2%   590ns ± 1%  -32.96%  (p=0.000 n=28+25)
NonZeroShifts/10000/shrVU-8   8.74µs ± 1%  7.34µs ± 3%  -15.94%  (p=0.000 n=27+30)
NonZeroShifts/10000/shlVU-8   8.73µs ± 1%  6.00µs ± 1%  -31.25%  (p=0.000 n=27+28)
NonZeroShifts/100000/shrVU-8  89.6µs ± 2%  75.5µs ± 2%  -15.80%  (p=0.000 n=29+29)
NonZeroShifts/100000/shlVU-8  89.6µs ± 2%  68.0µs ± 3%  -24.09%  (p=0.000 n=27+30)

Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958
Reviewed-on: https://go-review.googlesource.com/c/go/+/297050
Trust: Josh Bleecher Snyder <josharian@gmail.com>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
304 lines
7.7 KiB
Go
304 lines
7.7 KiB
Go
// asmcheck

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
|
|
package codegen
|
|
|
|
// ------------------ //
//    masked shifts   //
// ------------------ //
|
|
|
|
// lshMask64x64 returns v shifted left by s&63.
//
// The count is masked into the range 0..63, so it can never reach the
// width of v. The directives on the return statement require that s390x
// emits none of RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit
// ANDCC without ORN or ISEL.
func lshMask64x64(v int64, s uint64) int64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v << (s & 63)
}
|
|
|
|
// rshMask64Ux64 returns the unsigned value v shifted right by s&63.
//
// Masking keeps the count below the operand width. The directives on the
// return statement require that s390x emits none of RISBGZ, AND or LOCGR,
// and that ppc64le and ppc64 emit ANDCC without ORN or ISEL.
func rshMask64Ux64(v uint64, s uint64) uint64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v >> (s & 63)
}
|
|
|
|
// rshMask64x64 returns the signed value v arithmetically shifted right
// by s&63.
//
// Masking keeps the count below the operand width. The directives on the
// return statement require that s390x emits none of RISBGZ, AND or LOCGR,
// and that ppc64le and ppc64 emit ANDCC without ORN or ISEL. The ppc64le
// directive previously lacked the opening quote before ORN, which made
// the harness stop parsing after the ANDCC check.
func rshMask64x64(v int64, s uint64) int64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v >> (s & 63)
}
|
|
|
|
// lshMask32x64 returns the 32-bit value v shifted left by s&63.
//
// The masked count can still be as large as 63, which exceeds the width
// of v, so counts of 32 and above produce zero. The directives on the
// return statement require that s390x emits none of RISBGZ, AND or LOCGR,
// and that ppc64le and ppc64 emit ISEL without ORN.
func lshMask32x64(v int32, s uint64) int32 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ISEL",-"ORN"
	// ppc64:"ISEL",-"ORN"
	return v << (s & 63)
}
|
|
|
|
// rshMask32Ux64 returns the unsigned 32-bit value v shifted right by s&63.
//
// The masked count may exceed the 32-bit operand width, in which case the
// result is zero. The directives on the return statement require that
// s390x emits none of RISBGZ, AND or LOCGR, and that ppc64le and ppc64
// emit ISEL without ORN.
func rshMask32Ux64(v uint32, s uint64) uint32 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ISEL",-"ORN"
	// ppc64:"ISEL",-"ORN"
	return v >> (s & 63)
}
|
|
|
|
// rshMask32x64 returns the signed 32-bit value v arithmetically shifted
// right by s&63.
//
// The masked count may exceed the 32-bit operand width, in which case the
// result is all sign bits. The directives on the return statement require
// that s390x emits none of RISBGZ, AND or LOCGR, and that ppc64le and
// ppc64 emit ISEL without ORN.
func rshMask32x64(v int32, s uint64) int32 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ISEL",-"ORN"
	// ppc64:"ISEL",-"ORN"
	return v >> (s & 63)
}
|
|
|
|
// lshMask64x32 returns v shifted left by s&63, where the count s is a
// 32-bit unsigned value.
//
// The directives on the return statement require that s390x emits none of
// RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit ANDCC without ORN.
func lshMask64x32(v int64, s uint32) int64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN"
	// ppc64:"ANDCC",-"ORN"
	return v << (s & 63)
}
|
|
|
|
// rshMask64Ux32 returns the unsigned value v shifted right by s&63, where
// the count s is a 32-bit unsigned value.
//
// The directives on the return statement require that s390x emits none of
// RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit ANDCC without ORN.
func rshMask64Ux32(v uint64, s uint32) uint64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN"
	// ppc64:"ANDCC",-"ORN"
	return v >> (s & 63)
}
|
|
|
|
// rshMask64x32 returns the signed value v arithmetically shifted right by
// s&63, where the count s is a 32-bit unsigned value.
//
// The directives on the return statement require that s390x emits none of
// RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit ANDCC without ORN
// or ISEL.
func rshMask64x32(v int64, s uint32) int64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v >> (s & 63)
}
|
|
|
|
// lshMask64x32Ext returns v shifted left by uint(s&63), where the count s
// is a signed 32-bit value that is masked before being converted.
//
// Masking first makes the count non-negative and below 64 even when s is
// negative. The directives on the return statement require that s390x
// emits none of RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit
// ANDCC without ORN or ISEL.
func lshMask64x32Ext(v int64, s int32) int64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v << uint(s&63)
}
|
|
|
|
// rshMask64Ux32Ext returns the unsigned value v shifted right by
// uint(s&63), where the count s is a signed 32-bit value that is masked
// before being converted.
//
// The directives on the return statement require that s390x emits none of
// RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit ANDCC without ORN
// or ISEL.
func rshMask64Ux32Ext(v uint64, s int32) uint64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v >> uint(s&63)
}
|
|
|
|
// rshMask64x32Ext returns the signed value v arithmetically shifted right
// by uint(s&63), where the count s is a signed 32-bit value that is masked
// before being converted.
//
// The directives on the return statement require that s390x emits none of
// RISBGZ, AND or LOCGR, and that ppc64le and ppc64 emit ANDCC without ORN
// or ISEL.
func rshMask64x32Ext(v int64, s int32) int64 {
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	// ppc64le:"ANDCC",-"ORN",-"ISEL"
	// ppc64:"ANDCC",-"ORN",-"ISEL"
	return v >> uint(s&63)
}
|
|
|
|
// --------------- //
//  signed shifts  //
// --------------- //
|
|
|
|
// We do want to generate a test + panicshift for these cases.
//
// lshSigned shifts x left by a signed count of each integer width and
// discards the results. A negative count panics at run time, so each
// directive requires amd64 to emit a TEST instruction of the matching
// operand size for the shift that follows it.
func lshSigned(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
	// amd64:"TESTB"
	_ = x << v8
	// amd64:"TESTW"
	_ = x << v16
	// amd64:"TESTL"
	_ = x << v32
	// amd64:"TESTQ"
	_ = x << v64
}
|
|
|
|
// We want to avoid generating a test + panicshift for these cases.
//
// lshSignedMasked shifts x left by signed counts that are first masked
// with a non-negative constant, and discards the results. The masked
// count can never be negative, so each directive requires that amd64
// emits no TEST instruction of the matching operand size for the shift
// that follows it.
func lshSignedMasked(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
	// amd64:-"TESTB"
	_ = x << (v8 & 7)
	// amd64:-"TESTW"
	_ = x << (v16 & 15)
	// amd64:-"TESTL"
	_ = x << (v32 & 31)
	// amd64:-"TESTQ"
	_ = x << (v64 & 63)
}
|
|
|
|
// ------------------ //
//   bounded shifts   //
// ------------------ //
|
|
|
|
// rshGuarded64 returns the signed value v arithmetically shifted right by
// s when s is below 64, and panics with the message shift too large
// otherwise.
//
// The explicit bound check already establishes that the count is in
// range. The directives on the shift require that s390x emits none of
// RISBGZ, AND or LOCGR, and that wasm emits neither Select nor any
// instruction whose name ends in LtU.
func rshGuarded64(v int64, s uint) int64 {
	if s < 64 {
		// s390x:-"RISBGZ",-"AND",-"LOCGR"
		// wasm:-"Select",-".*LtU"
		return v >> s
	}
	panic("shift too large")
}
|
|
|
|
// rshGuarded64U returns the unsigned value v shifted right by s when s is
// below 64, and panics with the message shift too large otherwise.
//
// The explicit bound check already establishes that the count is in
// range. The directives on the shift require that s390x emits none of
// RISBGZ, AND or LOCGR, and that wasm emits neither Select nor any
// instruction whose name ends in LtU.
func rshGuarded64U(v uint64, s uint) uint64 {
	if s < 64 {
		// s390x:-"RISBGZ",-"AND",-"LOCGR"
		// wasm:-"Select",-".*LtU"
		return v >> s
	}
	panic("shift too large")
}
|
|
|
|
// lshGuarded64 returns v shifted left by s when s is below 64, and panics
// with the message shift too large otherwise.
//
// The explicit bound check already establishes that the count is in
// range. The directives on the shift require that s390x emits none of
// RISBGZ, AND or LOCGR, and that wasm emits neither Select nor any
// instruction whose name ends in LtU.
func lshGuarded64(v int64, s uint) int64 {
	if s < 64 {
		// s390x:-"RISBGZ",-"AND",-"LOCGR"
		// wasm:-"Select",-".*LtU"
		return v << s
	}
	panic("shift too large")
}
|
|
|
|
// checkUnneededTrunc sums entries of tab selected by narrowed combinations
// of v, b and h, adds twice the low 16 bits of d, and also returns twice
// the low 32 bits of d as a 64-bit value.
//
// Every index is at most 16 bits wide, so it is always within the bounds
// of tab. Each pair of directives lists the ppc64le and ppc64 instruction
// patterns that must or must not appear for the statement that follows it.
func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byte) (uint32, uint64) {

	// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	f := tab[byte(v)^b]
	// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	f += tab[byte(v)&b]
	// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	f += tab[byte(v)|b]
	// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	f += tab[uint16(v)&h]
	// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	f += tab[uint16(v)^h]
	// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
	f += tab[uint16(v)|h]
	// ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI"
	// ppc64:-".*AND",-"RLDICR",".*CLRLSLDI"
	f += tab[v&0xff]
	// ppc64le:-".*AND",".*CLRLSLWI"
	// ppc64:-".*AND",".*CLRLSLWI"
	f += 2 * uint32(uint16(d))
	// ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI"
	// ppc64:-".*AND",-"RLDICR",".*CLRLSLDI"
	g := 2 * uint64(uint32(d))
	return f, g
}
|
|
|
|
// checkCombinedShifts evaluates several mask-then-shift expressions on
// operands of different widths and returns the accumulated results.
//
// Each pair of directives lists the ppc64le and ppc64 instruction patterns
// that must or must not appear for the statement that follows it. The
// last two pairs apply only to the power9 variants of those targets.
func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64) (uint8, uint16, uint32, uint64, int64) {

	// ppc64le:-"AND","CLRLSLWI"
	// ppc64:-"AND","CLRLSLWI"
	f := (v8 & 0xF) << 2
	// ppc64le:"CLRLSLWI"
	// ppc64:"CLRLSLWI"
	f += byte(v16) << 3
	// ppc64le:-"AND","CLRLSLWI"
	// ppc64:-"AND","CLRLSLWI"
	g := (v16 & 0xFF) << 3
	// ppc64le:-"AND","CLRLSLWI"
	// ppc64:-"AND","CLRLSLWI"
	h := (v32 & 0xFFFFF) << 2
	// ppc64le:"CLRLSLDI"
	// ppc64:"CLRLSLDI"
	i := (v64 & 0xFFFFFFFF) << 5
	// ppc64le:-"CLRLSLDI"
	// ppc64:-"CLRLSLDI"
	i += (v64 & 0xFFFFFFF) << 38
	// ppc64le/power9:-"CLRLSLDI"
	// ppc64/power9:-"CLRLSLDI"
	i += (v64 & 0xFFFF00) << 10
	// ppc64le/power9:-"SLD","EXTSWSLI"
	// ppc64/power9:-"SLD","EXTSWSLI"
	j := int64(x32+32) * 8
	return f, g, h, i, j
}
|
|
|
|
// checkWidenAfterShift narrows right-shifted values of v to 32, 16 and 8
// bits, accumulates them, and returns the 8-bit sum widened to int64 and
// the 16-bit sum converted to uint64. The parameter u is not used.
//
// Each directive states whether ppc64le must or must not emit a MOVW,
// MOVH or MOVB instruction for the statement that follows it.
func checkWidenAfterShift(v int64, u uint64) (int64, uint64) {

	// ppc64le:-".*MOVW"
	f := int32(v >> 32)
	// ppc64le:".*MOVW"
	f += int32(v >> 31)
	// ppc64le:-".*MOVH"
	g := int16(v >> 48)
	// ppc64le:".*MOVH"
	g += int16(v >> 30)
	// ppc64le:-".*MOVH"
	g += int16(f >> 16)
	// ppc64le:-".*MOVB"
	h := int8(v >> 56)
	// ppc64le:".*MOVB"
	h += int8(v >> 28)
	// ppc64le:-".*MOVB"
	h += int8(f >> 24)
	// ppc64le:".*MOVB"
	h += int8(f >> 16)
	return int64(h), uint64(g)
}
|
|
|
|
// checkShiftAndMask32 rewrites the first eight elements of v in place,
// each with a different combination of a constant mask and a constant
// right shift. It panics if v has fewer than eight elements.
//
// Four of the expressions are always zero because the mask and the shift
// leave no bits in common. For those the directives expect a store of R0
// on ppc64le and ppc64, and for the others a single RLWNM with the listed
// operands.
func checkShiftAndMask32(v []uint32) {
	i := 0

	// ppc64le: "RLWNM\t[$]24, R[0-9]+, [$]12, [$]19, R[0-9]+"
	// ppc64: "RLWNM\t[$]24, R[0-9]+, [$]12, [$]19, R[0-9]+"
	v[i] = (v[i] & 0xFF00000) >> 8
	i++
	// ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]22, [$]29, R[0-9]+"
	// ppc64: "RLWNM\t[$]26, R[0-9]+, [$]22, [$]29, R[0-9]+"
	v[i] = (v[i] & 0xFF00) >> 6
	i++
	// ppc64le: "MOVW\tR0"
	// ppc64: "MOVW\tR0"
	v[i] = (v[i] & 0xFF) >> 8
	i++
	// ppc64le: "MOVW\tR0"
	// ppc64: "MOVW\tR0"
	v[i] = (v[i] & 0xF000000) >> 28
	i++
	// ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]24, [$]31, R[0-9]+"
	// ppc64: "RLWNM\t[$]26, R[0-9]+, [$]24, [$]31, R[0-9]+"
	v[i] = (v[i] >> 6) & 0xFF
	i++
	// ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]12, [$]19, R[0-9]+"
	// ppc64: "RLWNM\t[$]26, R[0-9]+, [$]12, [$]19, R[0-9]+"
	v[i] = (v[i] >> 6) & 0xFF000
	i++
	// ppc64le: "MOVW\tR0"
	// ppc64: "MOVW\tR0"
	v[i] = (v[i] >> 20) & 0xFF000
	i++
	// ppc64le: "MOVW\tR0"
	// ppc64: "MOVW\tR0"
	v[i] = (v[i] >> 24) & 0xFF00
	i++
}
|
|
|
|
// checkMergedShifts32 indexes the arrays a and b with values derived from
// right shifts of v and stores the loaded elements back into the arrays.
// The arrays are passed by value, so the stores are not visible to the
// caller. The parameter u is not used.
//
// Every index is below 256 and therefore in range. Each pair of
// directives lists the ppc64le and ppc64 instruction patterns that must
// or must not appear for the statement that follows it.
func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) {
	//ppc64le: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
	//ppc64: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
	a[0] = a[uint8(v>>24)]
	//ppc64le: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]21, [$]28, R[0-9]+"
	//ppc64: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]21, [$]28, R[0-9]+"
	b[0] = b[uint8(v>>24)]
	//ppc64le: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
	//ppc64: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
	b[1] = b[(v>>20)&0xFF]
	//ppc64le: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
	//ppc64: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
	b[2] = b[v>>25]
}
|
|
|
|
// 128 bit shifts
|
|
|
|
// check128bitShifts combines x and y the way a double-width shift does.
// The first result is x shifted right by bits&63 with the bits of y
// shifted in from the left, and the second is x shifted left by the same
// amount with the bits of y shifted in from the right.
//
// Both counts are masked to 0..63, so when bits is a multiple of 64 both
// results are simply x|y. The directives require amd64 to emit SHRQ and
// SHLQ in their three-operand form for the two combined expressions.
func check128bitShifts(x, y uint64, bits uint) (uint64, uint64) {
	s := bits & 63
	ŝ := (64 - bits) & 63
	// check that the shift operation has two commas (three operands)
	// amd64:"SHRQ.*,.*,"
	shr := x>>s | y<<ŝ
	// amd64:"SHLQ.*,.*,"
	shl := x<<s | y>>ŝ
	return shr, shl
}
|