mirror of
https://github.com/golang/go
synced 2024-09-29 17:24:34 -06:00
hash/crc32: improve performance for ppc64le
This change improves the performance of crc32 for ppc64le by using vpmsum and other vector instructions in the algorithm. The testcase was updated to test more sizes. Fixes #19570 BenchmarkCRC32/poly=IEEE/size=15/align=0-8 90.5 81.8 -9.61% BenchmarkCRC32/poly=IEEE/size=15/align=1-8 89.7 81.7 -8.92% BenchmarkCRC32/poly=IEEE/size=40/align=0-8 93.2 61.1 -34.44% BenchmarkCRC32/poly=IEEE/size=40/align=1-8 92.8 60.9 -34.38% BenchmarkCRC32/poly=IEEE/size=512/align=0-8 501 55.8 -88.86% BenchmarkCRC32/poly=IEEE/size=512/align=1-8 502 132 -73.71% BenchmarkCRC32/poly=IEEE/size=1kB/align=0-8 947 69.9 -92.62% BenchmarkCRC32/poly=IEEE/size=1kB/align=1-8 946 144 -84.78% BenchmarkCRC32/poly=IEEE/size=4kB/align=0-8 3602 186 -94.84% BenchmarkCRC32/poly=IEEE/size=4kB/align=1-8 3603 263 -92.70% BenchmarkCRC32/poly=IEEE/size=32kB/align=0-8 28404 1338 -95.29% BenchmarkCRC32/poly=IEEE/size=32kB/align=1-8 28856 1405 -95.13% BenchmarkCRC32/poly=Castagnoli/size=15/align=0-8 89.7 81.8 -8.81% BenchmarkCRC32/poly=Castagnoli/size=15/align=1-8 89.8 81.9 -8.80% BenchmarkCRC32/poly=Castagnoli/size=40/align=0-8 93.8 61.4 -34.54% BenchmarkCRC32/poly=Castagnoli/size=40/align=1-8 94.3 61.3 -34.99% BenchmarkCRC32/poly=Castagnoli/size=512/align=0-8 503 56.4 -88.79% BenchmarkCRC32/poly=Castagnoli/size=512/align=1-8 502 132 -73.71% BenchmarkCRC32/poly=Castagnoli/size=1kB/align=0-8 941 70.2 -92.54% BenchmarkCRC32/poly=Castagnoli/size=1kB/align=1-8 943 145 -84.62% BenchmarkCRC32/poly=Castagnoli/size=4kB/align=0-8 3588 186 -94.82% BenchmarkCRC32/poly=Castagnoli/size=4kB/align=1-8 3595 264 -92.66% BenchmarkCRC32/poly=Castagnoli/size=32kB/align=0-8 28266 1323 -95.32% BenchmarkCRC32/poly=Castagnoli/size=32kB/align=1-8 28344 1404 -95.05% Change-Id: Ic4d8274c66e0e87bfba5f609f508a3877aee6bb5 Reviewed-on: https://go-review.googlesource.com/38184 Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
parent
16663a85ba
commit
b6cd22c277
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!amd64p32,!s390x
|
||||
// +build !amd64,!amd64p32,!s390x,!ppc64le
|
||||
|
||||
package crc32
|
||||
|
||||
|
87
src/hash/crc32/crc32_ppc64le.go
Normal file
87
src/hash/crc32/crc32_ppc64le.go
Normal file
@ -0,0 +1,87 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package crc32
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const (
	vecMinLen    = 16 // the vector code works on 16-byte chunks; shorter input goes to slicing-by-8
	vecAlignMask = 15 // align to 16 bytes
	crcIEEE      = 1  // constant-table selector passed to vectorCrc32 for the IEEE polynomial
	crcCast      = 2  // constant-table selector for the Castagnoli polynomial
)
|
||||
|
||||
// ppc64SlicingUpdateBy8 updates crc over p using the slicing-by-8
// table; implemented in assembly (crc32_ppc64le.s).
//go:noescape
func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32

// vectorCrc32 updates crc over p using vpmsum vector instructions;
// implemented in assembly (crc32_ppc64le.s). poly selects the constant
// table (crcIEEE or crcCast).
// This function requires the buffer to be 16 byte aligned and > 16 bytes long.
//go:noescape
func vectorCrc32(crc uint32, poly uint32, p []byte) uint32
|
||||
|
||||
// archCastagnoliTable8 holds the slicing-by-8 tables used for short or
// unaligned Castagnoli updates; the vector path needs no table.
var archCastagnoliTable8 *slicing8Table

// archInitCastagnoli builds the slicing-by-8 tables for the Castagnoli
// polynomial. Called once before archUpdateCastagnoli is used.
func archInitCastagnoli() {
	archCastagnoliTable8 = slicingMakeTable(Castagnoli)
}
|
||||
|
||||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
|
||||
if len(p) >= 4*vecMinLen {
|
||||
// If not aligned then process the initial unaligned bytes
|
||||
|
||||
if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
|
||||
align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
|
||||
newlen := vecMinLen - align
|
||||
crc = ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p[:newlen])
|
||||
p = p[newlen:]
|
||||
}
|
||||
// p should be aligned now
|
||||
aligned := len(p) & ^vecAlignMask
|
||||
crc = vectorCrc32(crc, crcCast, p[:aligned])
|
||||
p = p[aligned:]
|
||||
}
|
||||
if len(p) == 0 {
|
||||
return crc
|
||||
}
|
||||
return ppc64SlicingUpdateBy8(crc, archCastagnoliTable8, p)
|
||||
}
|
||||
|
||||
// archAvailableIEEE reports whether an architecture-specific IEEE
// CRC-32 implementation is available; on this architecture it always is.
func archAvailableIEEE() bool {
	return true
}
|
||||
// archAvailableCastagnoli reports whether an architecture-specific
// Castagnoli CRC-32 implementation is available; on this architecture
// it always is.
func archAvailableCastagnoli() bool {
	return true
}
|
||||
|
||||
// archIeeeTable8 holds the slicing-by-8 tables used for short or
// unaligned IEEE updates; the vector path needs no table.
var archIeeeTable8 *slicing8Table

// archInitIEEE builds the slicing-by-8 tables for the IEEE polynomial.
// Called once before archUpdateIEEE is used.
func archInitIEEE() {
	// We still use slicing-by-8 for small buffers.
	archIeeeTable8 = slicingMakeTable(IEEE)
}
|
||||
|
||||
// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
|
||||
func archUpdateIEEE(crc uint32, p []byte) uint32 {
|
||||
|
||||
// Check if vector code should be used. If not aligned, then handle those
|
||||
// first up to the aligned bytes.
|
||||
|
||||
if len(p) >= 4*vecMinLen {
|
||||
if uint64(uintptr(unsafe.Pointer(&p[0])))&uint64(vecAlignMask) != 0 {
|
||||
align := uint64(uintptr(unsafe.Pointer(&p[0]))) & uint64(vecAlignMask)
|
||||
newlen := vecMinLen - align
|
||||
crc = ppc64SlicingUpdateBy8(crc, archIeeeTable8, p[:newlen])
|
||||
p = p[newlen:]
|
||||
}
|
||||
aligned := len(p) & ^vecAlignMask
|
||||
crc = vectorCrc32(crc, crcIEEE, p[:aligned])
|
||||
p = p[aligned:]
|
||||
}
|
||||
if len(p) == 0 {
|
||||
return crc
|
||||
}
|
||||
return ppc64SlicingUpdateBy8(crc, archIeeeTable8, p)
|
||||
}
|
707
src/hash/crc32/crc32_ppc64le.s
Normal file
707
src/hash/crc32/crc32_ppc64le.s
Normal file
@ -0,0 +1,707 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The vectorized implementation found below is a derived work
|
||||
// from code written by Anton Blanchard <anton@au.ibm.com> found
|
||||
// at https://github.com/antonblanchard/crc32-vpmsum. The original
|
||||
// is dual licensed under GPL and Apache 2. As the copyright holder
|
||||
// for the work, IBM has contributed this new work under
|
||||
// the golang license.
|
||||
|
||||
// Changes include porting to Go assembler with modifications for
|
||||
// the Go ABI for ppc64le.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#define POWER8_OFFSET 132
|
||||
|
||||
#define off16 R16
|
||||
#define off32 R17
|
||||
#define off48 R18
|
||||
#define off64 R19
|
||||
#define off80 R20
|
||||
#define off96 R21
|
||||
#define off112 R22
|
||||
|
||||
#define const1 V24
|
||||
#define const2 V25
|
||||
|
||||
#define byteswap V26
|
||||
#define mask_32bit V27
|
||||
#define mask_64bit V28
|
||||
#define zeroes V29
|
||||
|
||||
#define MAX_SIZE 32*1024
|
||||
#define REFLECT
|
||||
|
||||
// func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
//
// Table-driven slicing-by-8 CRC update, used for buffers that are too
// short or unaligned for the vector code. The incoming crc is inverted
// (NOR) on entry and re-inverted before return, matching the reflected
// CRC convention used by the Go tables.
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3      // incoming crc
	MOVD	table8+8(FP), R4   // *Table
	MOVD	p+16(FP), R5
	MOVD	p_len+24(FP), R6   // p len

	// Zero-length input: return the crc unchanged.
	CMP	$0,R6              // len == 0?
	BNE	start
	MOVW	R3,ret+40(FP)      // return crc
	RET

start:
	NOR	R3,R3,R7           // ^crc
	MOVWZ	R7,R7              // 32 bits
	CMP	R6,$16
	MOVD	R6,CTR
	BLT	short              // < 16 bytes: byte-at-a-time loop
	SRAD	$3,R6,R8           // 8 byte chunks
	MOVD	R8,CTR

	// Main loop: consume 8 bytes of p per iteration, one lookup in each
	// of the 8 slicing tables (tables are 256 entries * 4 bytes = 1024
	// bytes apart). The loads/XORs are interleaved for scheduling.
loop:
	MOVWZ	0(R5),R8           // 0-3 bytes of p ?Endian?
	MOVWZ	4(R5),R9           // 4-7 bytes of p
	MOVD	R4,R10             // &tab[0]
	XOR	R7,R8,R7           // crc ^= byte[0:3]
	RLDICL	$40,R9,$56,R17     // p[7]
	SLD	$2,R17,R17         // p[7]*4
	RLDICL	$40,R7,$56,R8      // crc>>24
	ADD	R17,R10,R17        // &tab[0][p[7]]
	SLD	$2,R8,R8           // crc>>24*4
	RLDICL	$48,R9,$56,R18     // p[6]
	SLD	$2,R18,R18         // p[6]*4
	ADD	$1024,R10,R10      // tab[1]
	MOVWZ	0(R17),R21         // tab[0][p[7]]
	RLDICL	$56,R9,$56,R19     // p[5]
	ADD	R10,R18,R18        // &tab[1][p[6]]
	SLD	$2,R19,R19         // p[5]*4:1
	MOVWZ	0(R18),R22         // tab[1][p[6]]
	ADD	$1024,R10,R10      // tab[2]
	XOR	R21,R22,R21        // xor done R22
	ADD	R19,R10,R19        // &tab[2][p[5]]
	ANDCC	$255,R9,R20        // p[4] ??
	SLD	$2,R20,R20         // p[4]*4
	MOVWZ	0(R19),R23         // tab[2][p[5]]
	ADD	$1024,R10,R10      // &tab[3]
	ADD	R20,R10,R20        // tab[3][p[4]]
	XOR	R21,R23,R21        // xor done R23
	ADD	$1024,R10,R10      // &tab[4]
	MOVWZ	0(R20),R24         // tab[3][p[4]]
	ADD	R10,R8,R23         // &tab[4][crc>>24]
	XOR	R21,R24,R21        // xor done R24
	MOVWZ	0(R23),R25         // tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24     // crc>>16&0xFF
	XOR	R21,R25,R21        // xor done R25
	ADD	$1024,R10,R10      // &tab[5]
	SLD	$2,R24,R24         // crc>>16&0xFF*4
	ADD	R24,R10,R24        // &tab[5][crc>>16&0xFF]
	MOVWZ	0(R24),R26         // tab[5][crc>>16&0xFF]
	XOR	R21,R26,R21        // xor done R26
	RLDICL	$56,R7,$56,R25     // crc>>8
	ADD	$1024,R10,R10      // &tab[6]
	SLD	$2,R25,R25         // crc>>8&FF*2
	ADD	R25,R10,R25        // &tab[6][crc>>8&0xFF]
	MOVBZ	R7,R26             // crc&0xFF
	ADD	$1024,R10,R10      // &tab[7]
	MOVWZ	0(R25),R27         // tab[6][crc>>8&0xFF]
	SLD	$2,R26,R26         // crc&0xFF*2
	XOR	R21,R27,R21        // xor done R27
	ADD	R26,R10,R26        // &tab[7][crc&0xFF]
	ADD	$8,R5              // p = p[8:]
	MOVWZ	0(R26),R28         // tab[7][crc&0xFF]
	XOR	R21,R28,R21        // xor done R28
	MOVWZ	R21,R7             // crc for next round
	BC	16,0,loop          // next 8 bytes
	ANDCC	$7,R6,R8           // any leftover bytes
	BEQ	done               // none --> done
	MOVD	R8,CTR             // byte count

	// Tail loop: one byte of p per iteration using tab[0].
short:
	MOVBZ	0(R5),R8           // get v
	MOVBZ	R7,R9              // byte(crc) -> R8 BE vs LE?
	MOVWZ	R7,R14
	SRD	$8,R14,R14         // crc>>8
	XOR	R8,R9,R8           // byte(crc)^v -> R8
	ADD	$1,R5              // ptr to next v
	SLD	$2,R8              // convert index-> bytes
	ADD	R8,R4,R9           // &tab[byte(crc)^v]
	MOVWZ	0(R9),R10          // tab[byte(crc)^v]
	XOR	R10,R14,R7         // loop crc in R7
	MOVWZ	R7,R7              // 32 bits
	BC	16,0,short
done:
	NOR	R7,R7,R7           // ^crc
	MOVW	R7,ret+40(FP)      // return crc
	RET
|
||||
|
||||
#ifdef BYTESWAP_DATA
// byteswapcons: 16-byte permute constant holding bytes 0x00..0x0f,
// loaded into the byteswap vector register. Only assembled when
// BYTESWAP_DATA is defined; unused in the default REFLECT build.
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif
|
||||
|
||||
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
|
||||
MOVWZ crc+0(FP), R3 // incoming crc
|
||||
MOVWZ ctab+4(FP), R14 // crc poly id
|
||||
MOVD p+8(FP), R4
|
||||
MOVD p_len+16(FP), R5 // p len
|
||||
|
||||
// R3 = incoming crc
|
||||
// R14 = constant table identifier
|
||||
// R5 = address of bytes
|
||||
// R6 = length of bytes
|
||||
|
||||
// defines for index loads
|
||||
|
||||
MOVD $16,off16
|
||||
MOVD $32,off32
|
||||
MOVD $48,off48
|
||||
MOVD $64,off64
|
||||
MOVD $80,off80
|
||||
MOVD $96,off96
|
||||
MOVD $112,off112
|
||||
MOVD $0,R15
|
||||
|
||||
MOVD R3,R10 // save initial crc
|
||||
|
||||
NOR R3,R3,R3 // ^crc
|
||||
MOVWZ R3,R3 // 32 bits
|
||||
VXOR zeroes,zeroes,zeroes // clear the V reg
|
||||
VSPLTISW $-1,V0
|
||||
VSLDOI $4,V29,V0,mask_32bit
|
||||
VSLDOI $8,V29,V0,mask_64bit
|
||||
|
||||
VXOR V8,V8,V8
|
||||
MTVSRD R3,VS40 // crc initial value VS40 = V8
|
||||
|
||||
#ifdef REFLECT
|
||||
VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
|
||||
#else
|
||||
VSLDOI $4,V8,zeroes,V8
|
||||
#endif
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
MOVD $·byteswapcons(SB),R3
|
||||
LVX (R3),byteswap
|
||||
#endif
|
||||
|
||||
CMPU R5,$256 // length of bytes
|
||||
BLT short
|
||||
|
||||
RLDICR $0,R5,$56,R6 // chunk to process
|
||||
|
||||
// First step for larger sizes
|
||||
l1: MOVD $32768,R7
|
||||
MOVD R7,R9
|
||||
CMP R6,R7 // compare R6, R7 (MAX SIZE)
|
||||
BGT top // less than MAX, just do remainder
|
||||
MOVD R6,R7
|
||||
top:
|
||||
SUB R7,R6,R6
|
||||
|
||||
// mainloop does 128 bytes at a time
|
||||
SRD $7,R7
|
||||
|
||||
// determine the offset into the constants table to start with.
|
||||
// Each constant is 128 bytes, used against 16 bytes of data.
|
||||
SLD $4,R7,R8
|
||||
SRD $3,R9,R9
|
||||
SUB R8,R9,R8
|
||||
|
||||
// The last iteration is reduced in a separate step
|
||||
ADD $-1,R7
|
||||
MOVD R7,CTR
|
||||
|
||||
// Determine which constant table (depends on poly)
|
||||
CMP R14,$1
|
||||
BNE castTable
|
||||
MOVD $·IEEEConst(SB),R3
|
||||
BR startConst
|
||||
castTable:
|
||||
MOVD $·CastConst(SB),R3
|
||||
|
||||
startConst:
|
||||
ADD R3,R8,R3 // starting point in constants table
|
||||
|
||||
VXOR V0,V0,V0 // clear the V regs
|
||||
VXOR V1,V1,V1
|
||||
VXOR V2,V2,V2
|
||||
VXOR V3,V3,V3
|
||||
VXOR V4,V4,V4
|
||||
VXOR V5,V5,V5
|
||||
VXOR V6,V6,V6
|
||||
VXOR V7,V7,V7
|
||||
|
||||
LVX (R3),const1 // loading constant values
|
||||
|
||||
CMP R15,$1 // Identify warm up pass
|
||||
BEQ next
|
||||
|
||||
// First warm up pass: load the bytes to process
|
||||
LVX (R4),V16
|
||||
LVX (R4+off16),V17
|
||||
LVX (R4+off32),V18
|
||||
LVX (R4+off48),V19
|
||||
LVX (R4+off64),V20
|
||||
LVX (R4+off80),V21
|
||||
LVX (R4+off96),V22
|
||||
LVX (R4+off112),V23
|
||||
ADD $128,R4 // bump up to next 128 bytes in buffer
|
||||
|
||||
VXOR V16,V8,V16 // xor in inital CRC in V8
|
||||
|
||||
next:
|
||||
BC 18,0,first_warm_up_done
|
||||
|
||||
ADD $16,R3 // bump up to next constants
|
||||
LVX (R3),const2 // table values
|
||||
|
||||
VPMSUMD V16,const1,V8 // second warm up pass
|
||||
LVX (R4),V16 // load from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V17,const1,V9 // vpmsumd with constants
|
||||
LVX (R4+off16),V17 // load next from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V18,const1,V10 // vpmsumd with constants
|
||||
LVX (R4+off32),V18 // load next from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V19,const1,V11 // vpmsumd with constants
|
||||
LVX (R4+off48),V19 // load next from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V20,const1,V12 // vpmsumd with constants
|
||||
LVX (R4+off64),V20 // load next from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V21,const1,V13 // vpmsumd with constants
|
||||
LVX (R4+off80),V21 // load next from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V22,const1,V14 // vpmsumd with constants
|
||||
LVX (R4+off96),V22 // load next from buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VPMSUMD V23,const1,V15 // vpmsumd with constants
|
||||
LVX (R4+off112),V23 // load next from buffer
|
||||
|
||||
ADD $128,R4 // bump up to next 128 bytes in buffer
|
||||
|
||||
BC 18,0,first_cool_down
|
||||
|
||||
cool_top:
|
||||
LVX (R3),const1 // constants
|
||||
ADD $16,R3 // inc to next constants
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V0,V8,V0 // xor in previous vpmsumd
|
||||
VPMSUMD V16,const2,V8 // vpmsumd with constants
|
||||
LVX (R4),V16 // buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V1,V9,V1 // xor in previous
|
||||
VPMSUMD V17,const2,V9 // vpmsumd with constants
|
||||
LVX (R4+off16),V17 // next in buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V2,V10,V2 // xor in previous
|
||||
VPMSUMD V18,const2,V10 // vpmsumd with constants
|
||||
LVX (R4+off32),V18 // next in buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V3,V11,V3 // xor in previous
|
||||
VPMSUMD V19,const2,V11 // vpmsumd with constants
|
||||
LVX (R4+off48),V19 // next in buffer
|
||||
LVX (R3),const2 // get next constant
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V4,V12,V4 // xor in previous
|
||||
VPMSUMD V20,const1,V12 // vpmsumd with constants
|
||||
LVX (R4+off64),V20 // next in buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V5,V13,V5 // xor in previous
|
||||
VPMSUMD V21,const1,V13 // vpmsumd with constants
|
||||
LVX (R4+off80),V21 // next in buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V6,V14,V6 // xor in previous
|
||||
VPMSUMD V22,const1,V14 // vpmsumd with constants
|
||||
LVX (R4+off96),V22 // next in buffer
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V7,V15,V7 // xor in previous
|
||||
VPMSUMD V23,const1,V15 // vpmsumd with constants
|
||||
LVX (R4+off112),V23 // next in buffer
|
||||
|
||||
ADD $128,R4 // bump up buffer pointer
|
||||
BC 16,0,cool_top // are we done?
|
||||
|
||||
first_cool_down:
|
||||
|
||||
// load the constants
|
||||
// xor in the previous value
|
||||
// vpmsumd the result with constants
|
||||
|
||||
LVX (R3),const1
|
||||
ADD $16,R3
|
||||
|
||||
VXOR V0,V8,V0
|
||||
VPMSUMD V16,const1,V8
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V1,V9,V1
|
||||
VPMSUMD V17,const1,V9
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V2,V10,V2
|
||||
VPMSUMD V18,const1,V10
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V3,V11,V3
|
||||
VPMSUMD V19,const1,V11
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V4,V12,V4
|
||||
VPMSUMD V20,const1,V12
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V5,V13,V5
|
||||
VPMSUMD V21,const1,V13
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V6,V14,V6
|
||||
VPMSUMD V22,const1,V14
|
||||
OR $0,R2,R2
|
||||
|
||||
VXOR V7,V15,V7
|
||||
VPMSUMD V23,const1,V15
|
||||
OR $0,R2,R2
|
||||
|
||||
second_cool_down:
|
||||
|
||||
VXOR V0,V8,V0
|
||||
VXOR V1,V9,V1
|
||||
VXOR V2,V10,V2
|
||||
VXOR V3,V11,V3
|
||||
VXOR V4,V12,V4
|
||||
VXOR V5,V13,V5
|
||||
VXOR V6,V14,V6
|
||||
VXOR V7,V15,V7
|
||||
|
||||
#ifdef REFLECT
|
||||
VSLDOI $4,V0,zeroes,V0
|
||||
VSLDOI $4,V1,zeroes,V1
|
||||
VSLDOI $4,V2,zeroes,V2
|
||||
VSLDOI $4,V3,zeroes,V3
|
||||
VSLDOI $4,V4,zeroes,V4
|
||||
VSLDOI $4,V5,zeroes,V5
|
||||
VSLDOI $4,V6,zeroes,V6
|
||||
VSLDOI $4,V7,zeroes,V7
|
||||
#endif
|
||||
|
||||
LVX (R4),V8
|
||||
LVX (R4+off16),V9
|
||||
LVX (R4+off32),V10
|
||||
LVX (R4+off48),V11
|
||||
LVX (R4+off64),V12
|
||||
LVX (R4+off80),V13
|
||||
LVX (R4+off96),V14
|
||||
LVX (R4+off112),V15
|
||||
|
||||
ADD $128,R4
|
||||
|
||||
VXOR V0,V8,V16
|
||||
VXOR V1,V9,V17
|
||||
VXOR V2,V10,V18
|
||||
VXOR V3,V11,V19
|
||||
VXOR V4,V12,V20
|
||||
VXOR V5,V13,V21
|
||||
VXOR V6,V14,V22
|
||||
VXOR V7,V15,V23
|
||||
|
||||
MOVD $1,R15
|
||||
CMP $0,R6
|
||||
ADD $128,R6
|
||||
|
||||
BNE l1
|
||||
ANDCC $127,R5
|
||||
SUBC R5,$128,R6
|
||||
ADD R3,R6,R3
|
||||
|
||||
SRD $4,R5,R7
|
||||
MOVD R7,CTR
|
||||
LVX (R3),V0
|
||||
LVX (R3+off16),V1
|
||||
LVX (R3+off32),V2
|
||||
LVX (R3+off48),V3
|
||||
LVX (R3+off64),V4
|
||||
LVX (R3+off80),V5
|
||||
LVX (R3+off96),V6
|
||||
LVX (R3+off112),V7
|
||||
|
||||
ADD $128,R3
|
||||
|
||||
VPMSUMW V16,V0,V0
|
||||
VPMSUMW V17,V1,V1
|
||||
VPMSUMW V18,V2,V2
|
||||
VPMSUMW V19,V3,V3
|
||||
VPMSUMW V20,V4,V4
|
||||
VPMSUMW V21,V5,V5
|
||||
VPMSUMW V22,V6,V6
|
||||
VPMSUMW V23,V7,V7
|
||||
|
||||
// now reduce the tail
|
||||
|
||||
CMP $0,R7
|
||||
BEQ next1
|
||||
|
||||
LVX (R4),V16
|
||||
LVX (R3),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
BC 18,0,next1
|
||||
|
||||
LVX (R4+off16),V16
|
||||
LVX (R3+off16),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
BC 18,0,next1
|
||||
|
||||
LVX (R4+off32),V16
|
||||
LVX (R3+off32),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
BC 18,0,next1
|
||||
|
||||
LVX (R4+off48),V16
|
||||
LVX (R3+off48),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
BC 18,0,next1
|
||||
|
||||
LVX (R4+off64),V16
|
||||
LVX (R3+off64),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
BC 18,0,next1
|
||||
|
||||
LVX (R4+off80),V16
|
||||
LVX (R3+off80),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
BC 18,0,next1
|
||||
|
||||
LVX (R4+off96),V16
|
||||
LVX (R3+off96),V17
|
||||
VPMSUMW V16,V17,V16
|
||||
VXOR V0,V16,V0
|
||||
|
||||
next1:
|
||||
VXOR V0,V1,V0
|
||||
VXOR V2,V3,V2
|
||||
VXOR V4,V5,V4
|
||||
VXOR V6,V7,V6
|
||||
VXOR V0,V2,V0
|
||||
VXOR V4,V6,V4
|
||||
VXOR V0,V4,V0
|
||||
|
||||
barrett_reduction:
|
||||
|
||||
CMP R14,$1
|
||||
BNE barcstTable
|
||||
MOVD $·IEEEBarConst(SB),R3
|
||||
BR startbarConst
|
||||
barcstTable:
|
||||
MOVD $·CastBarConst(SB),R3
|
||||
|
||||
startbarConst:
|
||||
LVX (R3),const1
|
||||
LVX (R3+off16),const2
|
||||
|
||||
VSLDOI $8,V0,V0,V1
|
||||
VXOR V0,V1,V0
|
||||
|
||||
#ifdef REFLECT
|
||||
VSPLTISB $1,V1
|
||||
VSL V0,V1,V0
|
||||
#endif
|
||||
|
||||
VAND V0,mask_64bit,V0
|
||||
|
||||
#ifndef REFLECT
|
||||
|
||||
VPMSUMD V0,const1,V1
|
||||
VSLDOI $8,zeroes,V1,V1
|
||||
VPMSUMD V1,const2,V1
|
||||
VXOR V0,V1,V0
|
||||
VSLDOI $8,V0,zeroes,V0
|
||||
|
||||
#else
|
||||
|
||||
VAND V0,mask_32bit,V1
|
||||
VPMSUMD V1,const1,V1
|
||||
VAND V1,mask_32bit,V1
|
||||
VPMSUMD V1,const2,V1
|
||||
VXOR V0,V1,V0
|
||||
VSLDOI $4,V0,zeroes,V0
|
||||
|
||||
#endif
|
||||
|
||||
MFVSRD VS32,R3 // VS32 = V0
|
||||
|
||||
NOR R3,R3,R3 // return ^crc
|
||||
MOVW R3,ret+32(FP)
|
||||
RET
|
||||
|
||||
first_warm_up_done:
|
||||
|
||||
LVX (R3),const1
|
||||
ADD $16,R3
|
||||
|
||||
VPMSUMD V16,const1,V8
|
||||
VPMSUMD V17,const1,V9
|
||||
VPMSUMD V18,const1,V10
|
||||
VPMSUMD V19,const1,V11
|
||||
VPMSUMD V20,const1,V12
|
||||
VPMSUMD V21,const1,V13
|
||||
VPMSUMD V22,const1,V14
|
||||
VPMSUMD V23,const1,V15
|
||||
|
||||
BR second_cool_down
|
||||
|
||||
short:
|
||||
CMP $0,R5
|
||||
BEQ zero
|
||||
|
||||
// compute short constants
|
||||
|
||||
CMP R14,$1
|
||||
BNE castshTable
|
||||
MOVD $·IEEEConst(SB),R3
|
||||
ADD $4080,R3
|
||||
BR startshConst
|
||||
castshTable:
|
||||
MOVD $·CastConst(SB),R3
|
||||
ADD $4080,R3
|
||||
|
||||
startshConst:
|
||||
SUBC R5,$256,R6 // sub from 256
|
||||
ADD R3,R6,R3
|
||||
|
||||
// calculate where to start
|
||||
|
||||
SRD $4,R5,R7
|
||||
MOVD R7,CTR
|
||||
|
||||
VXOR V19,V19,V19
|
||||
VXOR V20,V20,V20
|
||||
|
||||
LVX (R4),V0
|
||||
LVX (R3),V16
|
||||
VXOR V0,V8,V0
|
||||
VPMSUMW V0,V16,V0
|
||||
BC 18,0,v0
|
||||
|
||||
LVX (R4+off16),V1
|
||||
LVX (R3+off16),V17
|
||||
VPMSUMW V1,V17,V1
|
||||
BC 18,0,v1
|
||||
|
||||
LVX (R4+off32),V2
|
||||
LVX (R3+off32),V16
|
||||
VPMSUMW V2,V16,V2
|
||||
BC 18,0,v2
|
||||
|
||||
LVX (R4+off48),V3
|
||||
LVX (R3+off48),V17
|
||||
VPMSUMW V3,V17,V3
|
||||
BC 18,0,v3
|
||||
|
||||
LVX (R4+off64),V4
|
||||
LVX (R3+off64),V16
|
||||
VPMSUMW V4,V16,V4
|
||||
BC 18,0,v4
|
||||
|
||||
LVX (R4+off80),V5
|
||||
LVX (R3+off80),V17
|
||||
VPMSUMW V5,V17,V5
|
||||
BC 18,0,v5
|
||||
|
||||
LVX (R4+off96),V6
|
||||
LVX (R3+off96),V16
|
||||
VPMSUMW V6,V16,V6
|
||||
BC 18,0,v6
|
||||
|
||||
LVX (R4+off112),V7
|
||||
LVX (R3+off112),V17
|
||||
VPMSUMW V7,V17,V7
|
||||
BC 18,0,v7
|
||||
|
||||
ADD $128,R3
|
||||
ADD $128,R4
|
||||
|
||||
LVX (R4),V8
|
||||
LVX (R3),V16
|
||||
VPMSUMW V8,V16,V8
|
||||
BC 18,0,v8
|
||||
|
||||
LVX (R4+off16),V9
|
||||
LVX (R3+off16),V17
|
||||
VPMSUMW V9,V17,V9
|
||||
BC 18,0,v9
|
||||
|
||||
LVX (R4+off32),V10
|
||||
LVX (R3+off32),V16
|
||||
VPMSUMW V10,V16,V10
|
||||
BC 18,0,v10
|
||||
|
||||
LVX (R4+off48),V11
|
||||
LVX (R3+off48),V17
|
||||
VPMSUMW V11,V17,V11
|
||||
BC 18,0,v11
|
||||
|
||||
LVX (R4+off64),V12
|
||||
LVX (R3+off64),V16
|
||||
VPMSUMW V12,V16,V12
|
||||
BC 18,0,v12
|
||||
|
||||
LVX (R4+off80),V13
|
||||
LVX (R3+off80),V17
|
||||
VPMSUMW V13,V17,V13
|
||||
BC 18,0,v13
|
||||
|
||||
LVX (R4+off96),V14
|
||||
LVX (R3+off96),V16
|
||||
VPMSUMW V14,V16,V14
|
||||
BC 18,0,v14
|
||||
|
||||
LVX (R4+off112),V15
|
||||
LVX (R3+off112),V17
|
||||
VPMSUMW V15,V17,V15
|
||||
|
||||
VXOR V19,V15,V19
|
||||
v14: VXOR V20,V14,V20
|
||||
v13: VXOR V19,V13,V19
|
||||
v12: VXOR V20,V12,V20
|
||||
v11: VXOR V19,V11,V19
|
||||
v10: VXOR V20,V10,V20
|
||||
v9: VXOR V19,V9,V19
|
||||
v8: VXOR V20,V8,V20
|
||||
v7: VXOR V19,V7,V19
|
||||
v6: VXOR V20,V6,V20
|
||||
v5: VXOR V19,V5,V19
|
||||
v4: VXOR V20,V4,V20
|
||||
v3: VXOR V19,V3,V19
|
||||
v2: VXOR V20,V2,V20
|
||||
v1: VXOR V19,V1,V19
|
||||
v0: VXOR V20,V0,V20
|
||||
|
||||
VXOR V19,V20,V0
|
||||
|
||||
BR barrett_reduction
|
||||
|
||||
zero:
|
||||
// This case is the original crc, so just return it
|
||||
MOVW R10,ret+32(FP)
|
||||
RET
|
3286
src/hash/crc32/crc32_table_ppc64le.s
Normal file
3286
src/hash/crc32/crc32_table_ppc64le.s
Normal file
File diff suppressed because it is too large
Load Diff
@ -76,8 +76,9 @@ func testCrossCheck(t *testing.T, crcFunc1, crcFunc2 func(crc uint32, b []byte)
|
||||
// The AMD64 implementation has some cutoffs at lengths 168*3=504 and
|
||||
// 1344*3=4032. We should make sure lengths around these values are in the
|
||||
// list.
|
||||
lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 100, 128,
|
||||
500, 501, 502, 503, 504, 505, 512, 1000, 1024, 2000,
|
||||
lengths := []int{0, 1, 2, 3, 4, 5, 10, 16, 50, 63, 64, 65, 100,
|
||||
127, 128, 129, 255, 256, 257, 300, 312, 384, 416, 448, 480,
|
||||
500, 501, 502, 503, 504, 505, 512, 513, 1000, 1024, 2000,
|
||||
4030, 4031, 4032, 4033, 4036, 4040, 4048, 4096, 5000, 10000}
|
||||
for _, length := range lengths {
|
||||
p := make([]byte, length)
|
||||
|
150
src/hash/crc32/gen_const_ppc64le.go
Normal file
150
src/hash/crc32/gen_const_ppc64le.go
Normal file
@ -0,0 +1,150 @@
|
||||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Generate the constant table associated with the poly used by the
|
||||
// vpmsumd crc32 algorithm.
|
||||
//
|
||||
// go run gen_const_ppc64le.go
|
||||
//
|
||||
// generates crc32_table_ppc64le.s
|
||||
|
||||
// The following is derived from code written by Anton Blanchard
|
||||
// <anton@au.ibm.com> found at https://github.com/antonblanchard/crc32-vpmsum.
|
||||
// The original is dual licensed under GPL and Apache 2. As the copyright holder
|
||||
// for the work, IBM has contributed this new work under the golang license.
|
||||
|
||||
// This code was written in Go based on the original C implementation.
|
||||
|
||||
// This is a tool needed to generate the appropriate constants needed for
|
||||
// the vpmsum algorithm. It is included to generate new constant tables if
|
||||
// new polynomial values are included in the future.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
)
|
||||
|
||||
// blocking is the chunk size (32 KiB) the first constant table is sized
// for: that table reduces blocking*8 bits of input down to 1024 bits.
var blocking = 32 * 1024
|
||||
|
||||
// reflect_bits returns the low nr bits of b in reversed bit order
// (bit 0 becomes bit nr-1 and vice versa). Bits above nr are ignored.
func reflect_bits(b uint64, nr uint) uint64 {
	var out uint64
	for i := uint(0); i < nr; i++ {
		// Shift the accumulator up and append the next low bit of b.
		out = out<<1 | b&1
		b >>= 1
	}
	return out
}
|
||||
|
||||
func get_remainder(poly uint64, deg uint, n uint) uint64 {
|
||||
|
||||
rem, _ := xnmodp(n, poly, deg)
|
||||
return rem
|
||||
}
|
||||
|
||||
func get_quotient(poly uint64, bits, n uint) uint64 {
|
||||
|
||||
_, div := xnmodp(n, poly, bits)
|
||||
return div
|
||||
}
|
||||
|
||||
// xnmodp returns two values, p and div:
// p is the representation of the binary polynomial x**n mod (x**deg + "poly"),
// i.e. the modulus polynomial without its highest-order term;
// div is the binary representation of the polynomial x**n / (x**deg + "poly").
func xnmodp(n uint, poly uint64, deg uint) (uint64, uint64) {
	// Degenerate case: x**n is already reduced.
	if n < deg {
		return poly, 0
	}
	mask := uint64(1)<<deg - 1
	poly &= mask
	mod, div := poly, uint64(1)
	// Long division: bring down one power of x per step, n-deg steps total.
	for k := n - 1; k > deg-1; k-- {
		high := (mod >> (deg - 1)) & 1
		div = div<<1 | high
		mod <<= 1
		if high != 0 {
			mod ^= poly
		}
	}
	return mod & mask, div
}
|
||||
|
||||
func main() {
|
||||
w := new(bytes.Buffer)
|
||||
|
||||
fmt.Fprintf(w, "// autogenerated: do not edit!\n")
|
||||
fmt.Fprintf(w, "// generated from crc32/gen_const_ppc64le.go\n")
|
||||
fmt.Fprintln(w)
|
||||
fmt.Fprintf(w, "#include \"textflag.h\"\n")
|
||||
|
||||
// These are the polynomials supported in vector now.
|
||||
// If adding others, include the polynomial and a name
|
||||
// to identify it.
|
||||
|
||||
genCrc32ConstTable(w, 0xedb88320, "IEEE")
|
||||
genCrc32ConstTable(w, 0x82f63b78, "Cast")
|
||||
genCrc32ConstTable(w, 0xeb31d82e, "Koop")
|
||||
b := w.Bytes()
|
||||
|
||||
err := ioutil.WriteFile("crc32_table_ppc64le.s", b, 0666)
|
||||
if err != nil {
|
||||
fmt.Printf("can't write output: %s\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
// genCrc32ConstTable writes, into w, the assembler DATA/GLOBL
// definitions of the precomputed x^N mod p(x) constants (and the
// Barrett reduction constants) for one polynomial, named by polyid.
// The emitted text must stay byte-for-byte stable, since the generated
// file is checked in.
func genCrc32ConstTable(w *bytes.Buffer, poly uint32, polyid string) {

	ref_poly := reflect_bits(uint64(poly), 32)
	fmt.Fprintf(w, "\n\t/* Reduce %d kbits to 1024 bits */\n", blocking*8)
	// j counts emitted 8-byte words; each loop iteration emits one
	// 16-byte constant (two words).
	j := 0
	// First table: constants for reducing blocking*8 bits down to 1024
	// bits, one pair per 1024-bit step.
	for i := (blocking * 8) - 1024; i > 0; i -= 1024 {
		// NOTE(review): the <<1 shift here presumably aligns the
		// reflected constants for vpmsumd's doubled products — confirm
		// against the original crc32-vpmsum derivation.
		a := reflect_bits(get_remainder(ref_poly, 32, uint(i)), 32) << 1
		b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32) << 1

		fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s */\n", uint(i+64), "", uint(i), "")
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, j*8, b)
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%016x\n", polyid, (j+1)*8, a)

		j += 2
		fmt.Fprintf(w, "\n")
	}

	// Second table: constants for the final reduction, stepping down
	// 128 bits at a time with four remainders packed per pair of words.
	for i := (1024 * 2) - 128; i >= 0; i -= 128 {
		a := reflect_bits(get_remainder(ref_poly, 32, uint(i+32)), 32)
		b := reflect_bits(get_remainder(ref_poly, 32, uint(i+64)), 32)
		c := reflect_bits(get_remainder(ref_poly, 32, uint(i+96)), 32)
		d := reflect_bits(get_remainder(ref_poly, 32, uint(i+128)), 32)

		fmt.Fprintf(w, "\t/* x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s, x^%d mod p(x)%s */\n", i+128, "", i+96, "", i+64, "", i+32, "")
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, j*8, c, d)
		fmt.Fprintf(w, "DATA ·%sConst+%d(SB)/8,$0x%08x%08x\n", polyid, (j+1)*8, a, b)

		j += 2
		fmt.Fprintf(w, "\n")
	}

	fmt.Fprintf(w, "GLOBL ·%sConst(SB),RODATA,$4336\n", polyid)
	// Barrett reduction constants: the quotient x^64 / p(x) and the
	// polynomial itself, each padded to 16 bytes.
	fmt.Fprintf(w, "\n /* Barrett constant m - (4^32)/n */\n")
	fmt.Fprintf(w, "DATA ·%sBarConst(SB)/8,$0x%016x\n", polyid, reflect_bits(get_quotient(ref_poly, 32, 64), 33))
	fmt.Fprintf(w, "DATA ·%sBarConst+8(SB)/8,$0x0000000000000000\n", polyid)
	fmt.Fprintf(w, "DATA ·%sBarConst+16(SB)/8,$0x%016x\n", polyid, reflect_bits((uint64(1)<<32)|ref_poly, 33)) // reflected?
	fmt.Fprintf(w, "DATA ·%sBarConst+24(SB)/8,$0x0000000000000000\n", polyid)
	fmt.Fprintf(w, "GLOBL ·%sBarConst(SB),RODATA,$32\n", polyid)
}
|
Loading…
Reference in New Issue
Block a user