mirror of
https://github.com/golang/go
synced 2024-11-15 02:30:31 -07:00
hash/crc32: improve asm for ppc64SlicingUpdateBy8
Improvements are made in the assembler code which improve time and space by 9-10%.

1. ANDCC followed by SLD is combined and replaced by CLRLSLDI.
2. MOVWZ can use an indexed load and eliminate an ADD instruction in some cases.
   Example: ADD R7,R10,R7 followed by MOVWZ 0(R7),R5 can be replaced with just MOVWZ (R7)(R10),R5.
3. Optimizations for the block after the "short" label include the same MOVWZ use of indexed load, as well as other improvements.

The gain from code changes can be seen as follows, generated by benchstat:

goos: linux
goarch: ppc64le
pkg: hash/crc32
cpu: POWER10
                                      | oldCrc.out  | newCrc.out                          |
                                      | sec/op      | sec/op        vs base               |
CRC32/poly=IEEE/size=15/align=0-12      50.19n ± 1%    39.85n ± 0%  -20.59% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12      50.18n ± 1%    39.87n ± 0%  -20.54% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12      40.25n ± 0%    36.95n ± 0%   -8.19% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12      40.31n ± 0%    36.95n ± 0%   -8.36% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12     38.03n ± 0%    38.17n ± 0%   +0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12     89.19n ± 1%    73.65n ± 0%  -17.43% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12     50.73n ± 7%    50.14n ± 0%   -1.18% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12    101.00n ± 37%   81.58n ± 0%  -19.23% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12     98.30n ± 45%   93.05n ± 0%   -5.34% (p=0.043 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12     140.8n ± 0%    125.8n ± 0%  -10.65% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12    525.8n ± 0%    528.5n ± 0%   +0.52% (p=0.011 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12    584.4n ± 1%    576.3n ± 0%   -1.39% (p=0.002 n=6)
geomean                                 90.51n         81.74n        -9.69%

                                      | oldCrc.out   | newCrc.out                          |
                                      | B/s          | B/s           vs base               |
CRC32/poly=IEEE/size=15/align=0-12      285.0Mi ± 1%    359.0Mi ± 0%  +25.94% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12      285.1Mi ± 1%    358.8Mi ± 0%  +25.86% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12      947.8Mi ± 0%   1032.3Mi ± 0%   +8.91% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12      946.2Mi ± 0%   1032.5Mi ± 0%   +9.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12     12.54Gi ± 0%    12.49Gi ± 0%   -0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12     5.346Gi ± 1%    6.475Gi ± 0%  +21.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12     18.80Gi ± 7%    19.02Gi ± 0%   +1.20% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12     9.454Gi ± 27%  11.690Gi ± 0%  +23.66% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12     38.86Gi ± 31%   41.00Gi ± 0%   +5.49% (p=0.041 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12     27.10Gi ± 0%    30.32Gi ± 0%  +11.89% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12    58.05Gi ± 0%    57.74Gi ± 0%   -0.53% (p=0.009 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12    52.22Gi ± 1%    52.95Gi ± 0%   +1.41% (p=0.002 n=6)
geomean                                 6.074Gi         6.724Gi       +10.70%

Change-Id: I378c0e84e798656384a8009f4ac48b51614489b2
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10
Reviewed-on: https://go-review.googlesource.com/c/go/+/582395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Eli Bendersky <eliben@google.com>
This commit is contained in:
parent
ff0bc4669e
commit
ac174400f4
@ -63,67 +63,56 @@ loop:
|
||||
RLDICL $40,R9,$56,R17 // p[7]
|
||||
SLD $2,R17,R17 // p[7]*4
|
||||
RLDICL $40,R7,$56,R8 // crc>>24
|
||||
ADD R17,R10,R17 // &tab[0][p[7]]
|
||||
SLD $2,R8,R8 // crc>>24*4
|
||||
RLDICL $48,R9,$56,R18 // p[6]
|
||||
SLD $2,R18,R18 // p[6]*4
|
||||
MOVWZ (R10)(R17),R21 // tab[0][p[7]]
|
||||
ADD $1024,R10,R10 // tab[1]
|
||||
MOVWZ 0(R17),R21 // tab[0][p[7]]
|
||||
RLDICL $56,R9,$56,R19 // p[5]
|
||||
ADD R10,R18,R18 // &tab[1][p[6]]
|
||||
SLD $2,R19,R19 // p[5]*4:1
|
||||
MOVWZ 0(R18),R22 // tab[1][p[6]]
|
||||
MOVWZ (R10)(R18),R22 // tab[1][p[6]]
|
||||
ADD $1024,R10,R10 // tab[2]
|
||||
XOR R21,R22,R21 // xor done R22
|
||||
ADD R19,R10,R19 // &tab[2][p[5]]
|
||||
ANDCC $255,R9,R20 // p[4] ??
|
||||
SLD $2,R20,R20 // p[4]*4
|
||||
MOVWZ 0(R19),R23 // tab[2][p[5]]
|
||||
CLRLSLDI $56,R9,$2,R20
|
||||
MOVWZ (R10)(R19),R23 // tab[2][p[5]]
|
||||
ADD $1024,R10,R10 // &tab[3]
|
||||
ADD R20,R10,R20 // tab[3][p[4]]
|
||||
XOR R21,R23,R21 // xor done R23
|
||||
MOVWZ (R10)(R20),R24 // tab[3][p[4]]
|
||||
ADD $1024,R10,R10 // &tab[4]
|
||||
MOVWZ 0(R20),R24 // tab[3][p[4]]
|
||||
ADD R10,R8,R23 // &tab[4][crc>>24]
|
||||
XOR R21,R24,R21 // xor done R24
|
||||
MOVWZ 0(R23),R25 // tab[4][crc>>24]
|
||||
MOVWZ (R10)(R8),R25 // tab[4][crc>>24]
|
||||
RLDICL $48,R7,$56,R24 // crc>>16&0xFF
|
||||
XOR R21,R25,R21 // xor done R25
|
||||
ADD $1024,R10,R10 // &tab[5]
|
||||
SLD $2,R24,R24 // crc>>16&0xFF*4
|
||||
ADD R24,R10,R24 // &tab[5][crc>>16&0xFF]
|
||||
MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
|
||||
MOVWZ (R10)(R24),R26 // tab[5][crc>>16&0xFF]
|
||||
XOR R21,R26,R21 // xor done R26
|
||||
RLDICL $56,R7,$56,R25 // crc>>8
|
||||
ADD $1024,R10,R10 // &tab[6]
|
||||
SLD $2,R25,R25 // crc>>8&FF*2
|
||||
ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
|
||||
MOVBZ R7,R26 // crc&0xFF
|
||||
MOVWZ (R10)(R25),R27 // tab[6][crc>>8&0xFF]
|
||||
ADD $1024,R10,R10 // &tab[7]
|
||||
MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
|
||||
SLD $2,R26,R26 // crc&0xFF*2
|
||||
XOR R21,R27,R21 // xor done R27
|
||||
ADD R26,R10,R26 // &tab[7][crc&0xFF]
|
||||
ADD $8,R5 // p = p[8:]
|
||||
MOVWZ 0(R26),R28 // tab[7][crc&0xFF]
|
||||
MOVWZ (R10)(R26),R28 // tab[7][crc&0xFF]
|
||||
XOR R21,R28,R21 // xor done R28
|
||||
MOVWZ R21,R7 // crc for next round
|
||||
BC 16,0,loop // next 8 bytes
|
||||
BDNZ loop
|
||||
ANDCC $7,R6,R8 // any leftover bytes
|
||||
BEQ done // none --> done
|
||||
MOVD R8,CTR // byte count
|
||||
PCALIGN $16 // align short loop
|
||||
short:
|
||||
MOVBZ 0(R5),R8 // get v
|
||||
MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE?
|
||||
XOR R8,R7,R8 // byte(crc)^v -> R8
|
||||
RLDIC $2,R8,$54,R8 // rldicl r8,r8,2,22
|
||||
SRD $8,R7,R14 // crc>>8
|
||||
XOR R8,R9,R8 // byte(crc)^v -> R8
|
||||
ADD $1,R5 // ptr to next v
|
||||
SLD $2,R8 // convert index-> bytes
|
||||
ADD R8,R4,R9 // &tab[byte(crc)^v]
|
||||
MOVWZ 0(R9),R10 // tab[byte(crc)^v]
|
||||
MOVWZ (R4)(R8),R10
|
||||
ADD $1,R5
|
||||
XOR R10,R14,R7 // loop crc in R7
|
||||
BC 16,0,short
|
||||
BDNZ short
|
||||
done:
|
||||
NOR R7,R7,R7 // ^crc
|
||||
MOVW R7,ret+40(FP) // return crc
|
||||
@ -333,7 +322,7 @@ cool_top:
|
||||
LVX (R4+off112),V23 // next in buffer
|
||||
|
||||
ADD $128,R4 // bump up buffer pointer
|
||||
BC 16,0,cool_top // are we done?
|
||||
BDNZ cool_top // are we done?
|
||||
|
||||
first_cool_down:
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user