mirror of
https://github.com/golang/go
synced 2024-11-27 02:31:18 -07:00
hash/crc32: optimize arm64 crc32 implementation
ARMv8 defines crc32 instructions. Compared to the original crc32 calculation, this patch makes use of the crc32 instructions to do the crc32 calculation instead of the multiple lookup table algorithms. ARMv8 provides IEEE and Castagnoli polynomials for crc32 calculation, so the performance of these two types of crc32 is significantly improved. name old time/op new time/op delta CRC32/poly=IEEE/size=15/align=0-32 117ns ± 0% 38ns ± 0% -67.44% CRC32/poly=IEEE/size=15/align=1-32 117ns ± 0% 38ns ± 0% -67.52% CRC32/poly=IEEE/size=40/align=0-32 129ns ± 0% 41ns ± 0% -68.37% CRC32/poly=IEEE/size=40/align=1-32 129ns ± 0% 41ns ± 0% -68.29% CRC32/poly=IEEE/size=512/align=0-32 828ns ± 0% 246ns ± 0% -70.29% CRC32/poly=IEEE/size=512/align=1-32 828ns ± 0% 132ns ± 0% -84.06% CRC32/poly=IEEE/size=1kB/align=0-32 1.58µs ± 0% 0.46µs ± 0% -70.98% CRC32/poly=IEEE/size=1kB/align=1-32 1.58µs ± 0% 0.46µs ± 0% -70.92% CRC32/poly=IEEE/size=4kB/align=0-32 6.06µs ± 0% 1.74µs ± 0% -71.27% CRC32/poly=IEEE/size=4kB/align=1-32 6.10µs ± 0% 1.74µs ± 0% -71.44% CRC32/poly=IEEE/size=32kB/align=0-32 48.3µs ± 0% 13.7µs ± 0% -71.61% CRC32/poly=IEEE/size=32kB/align=1-32 48.3µs ± 0% 13.7µs ± 0% -71.60% CRC32/poly=Castagnoli/size=15/align=0-32 116ns ± 0% 38ns ± 0% -67.07% CRC32/poly=Castagnoli/size=15/align=1-32 116ns ± 0% 38ns ± 0% -66.90% CRC32/poly=Castagnoli/size=40/align=0-32 127ns ± 0% 40ns ± 0% -68.11% CRC32/poly=Castagnoli/size=40/align=1-32 127ns ± 0% 40ns ± 0% -68.11% CRC32/poly=Castagnoli/size=512/align=0-32 828ns ± 0% 132ns ± 0% -84.06% CRC32/poly=Castagnoli/size=512/align=1-32 827ns ± 0% 132ns ± 0% -84.04% CRC32/poly=Castagnoli/size=1kB/align=0-32 1.59µs ± 0% 0.22µs ± 0% -85.89% CRC32/poly=Castagnoli/size=1kB/align=1-32 1.58µs ± 0% 0.22µs ± 0% -85.79% CRC32/poly=Castagnoli/size=4kB/align=0-32 6.14µs ± 0% 0.77µs ± 0% -87.40% CRC32/poly=Castagnoli/size=4kB/align=1-32 6.06µs ± 0% 0.77µs ± 0% -87.25% CRC32/poly=Castagnoli/size=32kB/align=0-32 48.3µs ± 0% 5.9µs ± 0% -87.71% 
CRC32/poly=Castagnoli/size=32kB/align=1-32 48.4µs ± 0% 6.0µs ± 0% -87.69% CRC32/poly=Koopman/size=15/align=0-32 104ns ± 0% 104ns ± 0% +0.00% CRC32/poly=Koopman/size=15/align=1-32 104ns ± 0% 104ns ± 0% +0.00% CRC32/poly=Koopman/size=40/align=0-32 235ns ± 0% 235ns ± 0% +0.00% CRC32/poly=Koopman/size=40/align=1-32 235ns ± 0% 235ns ± 0% +0.00% CRC32/poly=Koopman/size=512/align=0-32 2.71µs ± 0% 2.71µs ± 0% -0.07% CRC32/poly=Koopman/size=512/align=1-32 2.71µs ± 0% 2.71µs ± 0% -0.04% CRC32/poly=Koopman/size=1kB/align=0-32 5.40µs ± 0% 5.39µs ± 0% -0.06% CRC32/poly=Koopman/size=1kB/align=1-32 5.40µs ± 0% 5.40µs ± 0% +0.02% CRC32/poly=Koopman/size=4kB/align=0-32 21.5µs ± 0% 21.5µs ± 0% -0.16% CRC32/poly=Koopman/size=4kB/align=1-32 21.5µs ± 0% 21.5µs ± 0% -0.05% CRC32/poly=Koopman/size=32kB/align=0-32 172µs ± 0% 172µs ± 0% -0.07% CRC32/poly=Koopman/size=32kB/align=1-32 172µs ± 0% 172µs ± 0% -0.01% name old speed new speed delta CRC32/poly=IEEE/size=15/align=0-32 128MB/s ± 0% 394MB/s ± 0% +207.95% CRC32/poly=IEEE/size=15/align=1-32 128MB/s ± 0% 394MB/s ± 0% +208.09% CRC32/poly=IEEE/size=40/align=0-32 310MB/s ± 0% 979MB/s ± 0% +216.07% CRC32/poly=IEEE/size=40/align=1-32 310MB/s ± 0% 979MB/s ± 0% +216.16% CRC32/poly=IEEE/size=512/align=0-32 618MB/s ± 0% 2074MB/s ± 0% +235.72% CRC32/poly=IEEE/size=512/align=1-32 618MB/s ± 0% 3852MB/s ± 0% +523.55% CRC32/poly=IEEE/size=1kB/align=0-32 646MB/s ± 0% 2225MB/s ± 0% +244.57% CRC32/poly=IEEE/size=1kB/align=1-32 647MB/s ± 0% 2225MB/s ± 0% +243.87% CRC32/poly=IEEE/size=4kB/align=0-32 676MB/s ± 0% 2352MB/s ± 0% +248.02% CRC32/poly=IEEE/size=4kB/align=1-32 672MB/s ± 0% 2352MB/s ± 0% +250.15% CRC32/poly=IEEE/size=32kB/align=0-32 678MB/s ± 0% 2387MB/s ± 0% +252.17% CRC32/poly=IEEE/size=32kB/align=1-32 678MB/s ± 0% 2388MB/s ± 0% +252.11% CRC32/poly=Castagnoli/size=15/align=0-32 129MB/s ± 0% 393MB/s ± 0% +205.51% CRC32/poly=Castagnoli/size=15/align=1-32 129MB/s ± 0% 390MB/s ± 0% +203.41% CRC32/poly=Castagnoli/size=40/align=0-32 314MB/s ± 0% 
988MB/s ± 0% +215.04% CRC32/poly=Castagnoli/size=40/align=1-32 314MB/s ± 0% 987MB/s ± 0% +214.68% CRC32/poly=Castagnoli/size=512/align=0-32 618MB/s ± 0% 3860MB/s ± 0% +524.32% CRC32/poly=Castagnoli/size=512/align=1-32 619MB/s ± 0% 3859MB/s ± 0% +523.66% CRC32/poly=Castagnoli/size=1kB/align=0-32 645MB/s ± 0% 4568MB/s ± 0% +608.56% CRC32/poly=Castagnoli/size=1kB/align=1-32 650MB/s ± 0% 4567MB/s ± 0% +602.94% CRC32/poly=Castagnoli/size=4kB/align=0-32 667MB/s ± 0% 5297MB/s ± 0% +693.81% CRC32/poly=Castagnoli/size=4kB/align=1-32 676MB/s ± 0% 5297MB/s ± 0% +684.00% CRC32/poly=Castagnoli/size=32kB/align=0-32 678MB/s ± 0% 5519MB/s ± 0% +713.83% CRC32/poly=Castagnoli/size=32kB/align=1-32 677MB/s ± 0% 5497MB/s ± 0% +712.04% CRC32/poly=Koopman/size=15/align=0-32 143MB/s ± 0% 144MB/s ± 0% +0.27% CRC32/poly=Koopman/size=15/align=1-32 143MB/s ± 0% 144MB/s ± 0% +0.33% CRC32/poly=Koopman/size=40/align=0-32 169MB/s ± 0% 170MB/s ± 0% +0.12% CRC32/poly=Koopman/size=40/align=1-32 170MB/s ± 0% 170MB/s ± 0% +0.08% CRC32/poly=Koopman/size=512/align=0-32 189MB/s ± 0% 189MB/s ± 0% +0.07% CRC32/poly=Koopman/size=512/align=1-32 189MB/s ± 0% 189MB/s ± 0% +0.04% CRC32/poly=Koopman/size=1kB/align=0-32 190MB/s ± 0% 190MB/s ± 0% +0.05% CRC32/poly=Koopman/size=1kB/align=1-32 190MB/s ± 0% 190MB/s ± 0% -0.01% CRC32/poly=Koopman/size=4kB/align=0-32 190MB/s ± 0% 190MB/s ± 0% +0.15% CRC32/poly=Koopman/size=4kB/align=1-32 190MB/s ± 0% 191MB/s ± 0% +0.05% CRC32/poly=Koopman/size=32kB/align=0-32 191MB/s ± 0% 191MB/s ± 0% +0.06% CRC32/poly=Koopman/size=32kB/align=1-32 191MB/s ± 0% 191MB/s ± 0% +0.02% Also fix a bug of arm64 assembler The optimization is mainly contributed by Fangming.Fang <fangming.fang@arm.com> Change-Id: I900678c2e445d7e8ad9e2a9ab3305d649230905f Reviewed-on: https://go-review.googlesource.com/40074 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
aaf4682171
commit
ab636b899c
@ -2564,7 +2564,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
|
||||
}
|
||||
o1 |= ((uint32(v) & 0x20) << (31 - 5)) | ((uint32(v) & 0x1F) << 19)
|
||||
o1 |= uint32(c.brdist(p, 0, 14, 2) << 5)
|
||||
o1 |= uint32(p.Reg)
|
||||
o1 |= uint32(p.Reg & 31)
|
||||
|
||||
case 41: /* eret, nop, others with no operands */
|
||||
o1 = c.op0(p, p.As)
|
||||
|
51
src/hash/crc32/crc32_arm64.go
Normal file
51
src/hash/crc32/crc32_arm64.go
Normal file
@ -0,0 +1,51 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// ARM64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
|
||||
// description of the interface that each architecture-specific file
|
||||
// implements.
|
||||
|
||||
package crc32
|
||||
|
||||
// supportsCRC32 reports the runtime's CRC32 capability flag.
// It has no Go body; the implementation lives in crc32_arm64.s.
func supportsCRC32() bool

// castagnoliUpdate updates the non-inverted crc with the bytes of p using
// the Castagnoli polynomial. Implemented in crc32_arm64.s.
func castagnoliUpdate(crc uint32, p []byte) uint32

// ieeeUpdate updates the non-inverted crc with the bytes of p using the
// IEEE polynomial. Implemented in crc32_arm64.s.
func ieeeUpdate(crc uint32, p []byte) uint32

// hasCRC32 caches the result of supportsCRC32, evaluated once during
// package initialization.
var hasCRC32 = supportsCRC32()
|
||||
|
||||
func archAvailableCastagnoli() bool {
|
||||
return hasCRC32
|
||||
}
|
||||
|
||||
func archInitCastagnoli() {
|
||||
if !hasCRC32 {
|
||||
panic("arch-specific crc32 instruction for Catagnoli not available")
|
||||
}
|
||||
}
|
||||
|
||||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
|
||||
if !hasCRC32 {
|
||||
panic("arch-specific crc32 instruction for Castagnoli not available")
|
||||
}
|
||||
|
||||
return ^castagnoliUpdate(^crc, p)
|
||||
}
|
||||
|
||||
func archAvailableIEEE() bool {
|
||||
return hasCRC32
|
||||
}
|
||||
|
||||
func archInitIEEE() {
|
||||
if !hasCRC32 {
|
||||
panic("arch-specific crc32 instruction for IEEE not available")
|
||||
}
|
||||
}
|
||||
|
||||
func archUpdateIEEE(crc uint32, p []byte) uint32 {
|
||||
if !hasCRC32 {
|
||||
panic("arch-specific crc32 instruction for IEEE not available")
|
||||
}
|
||||
|
||||
return ^ieeeUpdate(^crc, p)
|
||||
}
|
97
src/hash/crc32/crc32_arm64.s
Normal file
97
src/hash/crc32/crc32_arm64.s
Normal file
@ -0,0 +1,97 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// castagnoliUpdate folds the bytes of p into the non-inverted Castagnoli
// CRC value crc and returns the new non-inverted value.
//
// Register use:
//	R9  running CRC
//	R13 read pointer into p
//	R11 number of bytes still to process
//	R10 scratch for the data just loaded

// func castagnoliUpdate(crc uint32, p []byte) uint32
TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36
	MOVWU	crc+0(FP), R9
	MOVD	p+8(FP), R13
	MOVD	p_len+16(FP), R11

	// Inputs shorter than 8 bytes go straight to the tail handling.
	CMP	$8, R11
	BLT	tail

loop8:
	// Consume 8 bytes per iteration; .P post-increments the pointer.
	MOVD.P	8(R13), R10
	CRC32CX	R10, R9
	SUB	$8, R11

	// Keep looping while at least 8 bytes remain.
	CMP	$8, R11
	BGE	loop8

tail:
	// Fewer than 8 bytes remain. Bits 2, 1 and 0 of the remaining count
	// select the 4-, 2- and 1-byte steps respectively.
	TBZ	$2, R11, tail2

	MOVWU.P	4(R13), R10
	CRC32CW	R10, R9

tail2:
	TBZ	$1, R11, tail1

	MOVHU.P	2(R13), R10
	CRC32CH	R10, R9

tail1:
	TBZ	$0, R11, finish

	MOVBU	(R13), R10
	CRC32CB	R10, R9

finish:
	MOVWU	R9, ret+32(FP)
	RET
|
||||
|
||||
// ieeeUpdate folds the bytes of p into the non-inverted IEEE CRC value crc
// and returns the new non-inverted value.
//
// Register use:
//	R9  running CRC
//	R13 read pointer into p
//	R11 number of bytes still to process
//	R10 scratch for the data just loaded

// func ieeeUpdate(crc uint32, p []byte) uint32
TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36
	MOVWU	crc+0(FP), R9
	MOVD	p+8(FP), R13
	MOVD	p_len+16(FP), R11

	// Inputs shorter than 8 bytes go straight to the tail handling.
	CMP	$8, R11
	BLT	tail

loop8:
	// Consume 8 bytes per iteration; .P post-increments the pointer.
	MOVD.P	8(R13), R10
	CRC32X	R10, R9
	SUB	$8, R11

	// Keep looping while at least 8 bytes remain.
	CMP	$8, R11
	BGE	loop8

tail:
	// Fewer than 8 bytes remain. Bits 2, 1 and 0 of the remaining count
	// select the 4-, 2- and 1-byte steps respectively.
	TBZ	$2, R11, tail2

	MOVWU.P	4(R13), R10
	CRC32W	R10, R9

tail2:
	TBZ	$1, R11, tail1

	MOVHU.P	2(R13), R10
	CRC32H	R10, R9

tail1:
	TBZ	$0, R11, finish

	MOVBU	(R13), R10
	CRC32B	R10, R9

finish:
	MOVWU	R9, ret+32(FP)
	RET
|
||||
|
||||
// supportsCRC32 returns the current value of runtime·supportCRC32.
//
// func supportsCRC32() bool
TEXT ·supportsCRC32(SB),NOSPLIT,$0-1
	MOVB	runtime·supportCRC32(SB), R1	// load the runtime's one-byte flag
	MOVB	R1, ret+0(FP)			// store it as the bool result
	RET
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!amd64p32,!s390x,!ppc64le
|
||||
// +build !amd64,!amd64p32,!s390x,!ppc64le,!arm64
|
||||
|
||||
package crc32
|
||||
|
||||
|
@ -4,6 +4,8 @@
|
||||
|
||||
package runtime
|
||||
|
||||
var supportCRC32 = false
|
||||
|
||||
//go:nosplit
|
||||
func cputicks() int64 {
|
||||
// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
|
||||
|
@ -4,7 +4,12 @@
|
||||
|
||||
package runtime
|
||||
|
||||
const (
|
||||
_ARM64_FEATURE_HAS_CRC32 = 0x80
|
||||
)
|
||||
|
||||
var randomNumber uint32
|
||||
var supportCRC32 bool
|
||||
|
||||
func archauxv(tag, val uintptr) {
|
||||
switch tag {
|
||||
@ -14,6 +19,8 @@ func archauxv(tag, val uintptr) {
|
||||
// it as a byte array.
|
||||
randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
|
||||
uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
|
||||
case _AT_HWCAP:
|
||||
supportCRC32 = val & _ARM64_FEATURE_HAS_CRC32 != 0
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user