1
0
mirror of https://github.com/golang/go synced 2024-11-15 04:40:28 -07:00

hash/crc32: improve asm for ppc64SlicingUpdateBy8

Improvements are made in the assembler code which improve time and
space by 9-10%.
1. ANDCC followed by SLD is combined and replaced by a single CLRLSLDI.
2. MOVWZ can use an indexed load and eliminate an ADD instruction in some cases.
	Example: ADD R7,R10,R7 followed by MOVWZ 0(R7),R5 can be replaced with just MOVWZ (R7)(R10),R5.
3. Optimizations for the block after the "short" label include the same MOVWZ use of indexed load, as well as other improvements.

The gain from the code changes can be seen in the following output,
generated by benchstat:

goos: linux
goarch: ppc64le
pkg: hash/crc32
cpu: POWER10
                                     |  oldCrc.out   |  newCrc.out                     	 |
                                     |    sec/op     |   sec/op     vs base            	 |
CRC32/poly=IEEE/size=15/align=0-12      50.19n ±  1%   39.85n ± 0%  -20.59% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12      50.18n ±  1%   39.87n ± 0%  -20.54% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12      40.25n ±  0%   36.95n ± 0%   -8.19% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12      40.31n ±  0%   36.95n ± 0%   -8.36% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12     38.03n ±  0%   38.17n ± 0%   +0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12     89.19n ±  1%   73.65n ± 0%  -17.43% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12     50.73n ±  7%   50.14n ± 0%   -1.18% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12    101.00n ± 37%   81.58n ± 0%  -19.23% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12     98.30n ± 45%   93.05n ± 0%   -5.34% (p=0.043 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12     140.8n ±  0%   125.8n ± 0%  -10.65% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12    525.8n ±  0%   528.5n ± 0%   +0.52% (p=0.011 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12    584.4n ±  1%   576.3n ± 0%   -1.39% (p=0.002 n=6)
geomean                                 90.51n         81.74n        -9.69%

	                             |    oldCrc.out |    newCrc.out           		  |
                                     |      B/s      |     B/s       vs base    	  |
CRC32/poly=IEEE/size=15/align=0-12     285.0Mi ±  1%    359.0Mi ± 0%  +25.94% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12     285.1Mi ±  1%    358.8Mi ± 0%  +25.86% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12     947.8Mi ±  0%   1032.3Mi ± 0%   +8.91% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12     946.2Mi ±  0%   1032.5Mi ± 0%   +9.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12    12.54Gi ±  0%    12.49Gi ± 0%   -0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12    5.346Gi ±  1%    6.475Gi ± 0%  +21.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12    18.80Gi ±  7%    19.02Gi ± 0%   +1.20% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12    9.454Gi ± 27%   11.690Gi ± 0%  +23.66% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12    38.86Gi ± 31%    41.00Gi ± 0%   +5.49% (p=0.041 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12    27.10Gi ±  0%    30.32Gi ± 0%  +11.89% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12   58.05Gi ±  0%    57.74Gi ± 0%   -0.53% (p=0.009 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12   52.22Gi ±  1%    52.95Gi ± 0%   +1.41% (p=0.002 n=6)
geomean                                6.074Gi          6.724Gi       +10.70%

Change-Id: I378c0e84e798656384a8009f4ac48b51614489b2
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10
Reviewed-on: https://go-review.googlesource.com/c/go/+/582395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Eli Bendersky <eliben@google.com>
This commit is contained in:
Jayanth Krishnamurthy 2024-04-29 12:37:27 -05:00 committed by Lynn Boger
parent ff0bc4669e
commit ac174400f4

View File

@ -63,67 +63,56 @@ loop:
RLDICL $40,R9,$56,R17 // p[7] RLDICL $40,R9,$56,R17 // p[7]
SLD $2,R17,R17 // p[7]*4 SLD $2,R17,R17 // p[7]*4
RLDICL $40,R7,$56,R8 // crc>>24 RLDICL $40,R7,$56,R8 // crc>>24
ADD R17,R10,R17 // &tab[0][p[7]]
SLD $2,R8,R8 // crc>>24*4 SLD $2,R8,R8 // crc>>24*4
RLDICL $48,R9,$56,R18 // p[6] RLDICL $48,R9,$56,R18 // p[6]
SLD $2,R18,R18 // p[6]*4 SLD $2,R18,R18 // p[6]*4
MOVWZ (R10)(R17),R21 // tab[0][p[7]]
ADD $1024,R10,R10 // tab[1] ADD $1024,R10,R10 // tab[1]
MOVWZ 0(R17),R21 // tab[0][p[7]]
RLDICL $56,R9,$56,R19 // p[5] RLDICL $56,R9,$56,R19 // p[5]
ADD R10,R18,R18 // &tab[1][p[6]]
SLD $2,R19,R19 // p[5]*4:1 SLD $2,R19,R19 // p[5]*4:1
MOVWZ 0(R18),R22 // tab[1][p[6]] MOVWZ (R10)(R18),R22 // tab[1][p[6]]
ADD $1024,R10,R10 // tab[2] ADD $1024,R10,R10 // tab[2]
XOR R21,R22,R21 // xor done R22 XOR R21,R22,R21 // xor done R22
ADD R19,R10,R19 // &tab[2][p[5]] CLRLSLDI $56,R9,$2,R20
ANDCC $255,R9,R20 // p[4] ?? MOVWZ (R10)(R19),R23 // tab[2][p[5]]
SLD $2,R20,R20 // p[4]*4
MOVWZ 0(R19),R23 // tab[2][p[5]]
ADD $1024,R10,R10 // &tab[3] ADD $1024,R10,R10 // &tab[3]
ADD R20,R10,R20 // tab[3][p[4]]
XOR R21,R23,R21 // xor done R23 XOR R21,R23,R21 // xor done R23
MOVWZ (R10)(R20),R24 // tab[3][p[4]]
ADD $1024,R10,R10 // &tab[4] ADD $1024,R10,R10 // &tab[4]
MOVWZ 0(R20),R24 // tab[3][p[4]]
ADD R10,R8,R23 // &tab[4][crc>>24]
XOR R21,R24,R21 // xor done R24 XOR R21,R24,R21 // xor done R24
MOVWZ 0(R23),R25 // tab[4][crc>>24] MOVWZ (R10)(R8),R25 // tab[4][crc>>24]
RLDICL $48,R7,$56,R24 // crc>>16&0xFF RLDICL $48,R7,$56,R24 // crc>>16&0xFF
XOR R21,R25,R21 // xor done R25 XOR R21,R25,R21 // xor done R25
ADD $1024,R10,R10 // &tab[5] ADD $1024,R10,R10 // &tab[5]
SLD $2,R24,R24 // crc>>16&0xFF*4 SLD $2,R24,R24 // crc>>16&0xFF*4
ADD R24,R10,R24 // &tab[5][crc>>16&0xFF] MOVWZ (R10)(R24),R26 // tab[5][crc>>16&0xFF]
MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
XOR R21,R26,R21 // xor done R26 XOR R21,R26,R21 // xor done R26
RLDICL $56,R7,$56,R25 // crc>>8 RLDICL $56,R7,$56,R25 // crc>>8
ADD $1024,R10,R10 // &tab[6] ADD $1024,R10,R10 // &tab[6]
SLD $2,R25,R25 // crc>>8&FF*2 SLD $2,R25,R25 // crc>>8&FF*2
ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
MOVBZ R7,R26 // crc&0xFF MOVBZ R7,R26 // crc&0xFF
MOVWZ (R10)(R25),R27 // tab[6][crc>>8&0xFF]
ADD $1024,R10,R10 // &tab[7] ADD $1024,R10,R10 // &tab[7]
MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
SLD $2,R26,R26 // crc&0xFF*2 SLD $2,R26,R26 // crc&0xFF*2
XOR R21,R27,R21 // xor done R27 XOR R21,R27,R21 // xor done R27
ADD R26,R10,R26 // &tab[7][crc&0xFF]
ADD $8,R5 // p = p[8:] ADD $8,R5 // p = p[8:]
MOVWZ 0(R26),R28 // tab[7][crc&0xFF] MOVWZ (R10)(R26),R28 // tab[7][crc&0xFF]
XOR R21,R28,R21 // xor done R28 XOR R21,R28,R21 // xor done R28
MOVWZ R21,R7 // crc for next round MOVWZ R21,R7 // crc for next round
BC 16,0,loop // next 8 bytes BDNZ loop
ANDCC $7,R6,R8 // any leftover bytes ANDCC $7,R6,R8 // any leftover bytes
BEQ done // none --> done BEQ done // none --> done
MOVD R8,CTR // byte count MOVD R8,CTR // byte count
PCALIGN $16 // align short loop PCALIGN $16 // align short loop
short: short:
MOVBZ 0(R5),R8 // get v MOVBZ 0(R5),R8 // get v
MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE? XOR R8,R7,R8 // byte(crc)^v -> R8
RLDIC $2,R8,$54,R8 // rldicl r8,r8,2,22
SRD $8,R7,R14 // crc>>8 SRD $8,R7,R14 // crc>>8
XOR R8,R9,R8 // byte(crc)^v -> R8 MOVWZ (R4)(R8),R10
ADD $1,R5 // ptr to next v ADD $1,R5
SLD $2,R8 // convert index-> bytes
ADD R8,R4,R9 // &tab[byte(crc)^v]
MOVWZ 0(R9),R10 // tab[byte(crc)^v]
XOR R10,R14,R7 // loop crc in R7 XOR R10,R14,R7 // loop crc in R7
BC 16,0,short BDNZ short
done: done:
NOR R7,R7,R7 // ^crc NOR R7,R7,R7 // ^crc
MOVW R7,ret+40(FP) // return crc MOVW R7,ret+40(FP) // return crc
@ -333,7 +322,7 @@ cool_top:
LVX (R4+off112),V23 // next in buffer LVX (R4+off112),V23 // next in buffer
ADD $128,R4 // bump up buffer pointer ADD $128,R4 // bump up buffer pointer
BC 16,0,cool_top // are we done? BDNZ cool_top // are we done?
first_cool_down: first_cool_down: