From ac174400f460e9b577079e8606439e0bae62adb0 Mon Sep 17 00:00:00 2001 From: Jayanth Krishnamurthy Date: Mon, 29 Apr 2024 12:37:27 -0500 Subject: [PATCH] hash/crc32: improve asm for ppc64SlicingUpdateBy8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improvements are made in the assembler code which improve time and space by 9-10%. 1. ANDCC followed by SLD is combined and replaced by CLRLSLDI. 2. MOVWZ can use an indexed load and eliminate an ADD instruction in some cases. Example: ADD R7,R10,R7 followed by MOVWZ 0(R7),R5 can be replaced with just MOVWZ (R7)(R10),R5. 3. Optimizations for the block after the "short" label include the same MOVWZ use of indexed load, as well as other improvements. The gain from code changes can be seen as follows, generated by benchstat: goos: linux goarch: ppc64le pkg: hash/crc32 cpu: POWER10 | oldCrc.out | newCrc.out | | sec/op | sec/op vs base | CRC32/poly=IEEE/size=15/align=0-12 50.19n ± 1% 39.85n ± 0% -20.59% (p=0.002 n=6) CRC32/poly=IEEE/size=15/align=1-12 50.18n ± 1% 39.87n ± 0% -20.54% (p=0.002 n=6) CRC32/poly=IEEE/size=40/align=0-12 40.25n ± 0% 36.95n ± 0% -8.19% (p=0.002 n=6) CRC32/poly=IEEE/size=40/align=1-12 40.31n ± 0% 36.95n ± 0% -8.36% (p=0.002 n=6) CRC32/poly=IEEE/size=512/align=0-12 38.03n ± 0% 38.17n ± 0% +0.37% (p=0.002 n=6) CRC32/poly=IEEE/size=512/align=1-12 89.19n ± 1% 73.65n ± 0% -17.43% (p=0.002 n=6) CRC32/poly=IEEE/size=1kB/align=0-12 50.73n ± 7% 50.14n ± 0% -1.18% (p=0.002 n=6) CRC32/poly=IEEE/size=1kB/align=1-12 101.00n ± 37% 81.58n ± 0% -19.23% (p=0.002 n=6) CRC32/poly=IEEE/size=4kB/align=0-12 98.30n ± 45% 93.05n ± 0% -5.34% (p=0.043 n=6) CRC32/poly=IEEE/size=4kB/align=1-12 140.8n ± 0% 125.8n ± 0% -10.65% (p=0.002 n=6) CRC32/poly=IEEE/size=32kB/align=0-12 525.8n ± 0% 528.5n ± 0% +0.52% (p=0.011 n=6) CRC32/poly=IEEE/size=32kB/align=1-12 584.4n ± 1% 576.3n ± 0% -1.39% (p=0.002 n=6) geomean 90.51n 81.74n -9.69% | oldCrc.out | newCrc.out | | B/s | B/s vs base | 
CRC32/poly=IEEE/size=15/align=0-12 285.0Mi ± 1% 359.0Mi ± 0% +25.94% (p=0.002 n=6) CRC32/poly=IEEE/size=15/align=1-12 285.1Mi ± 1% 358.8Mi ± 0% +25.86% (p=0.002 n=6) CRC32/poly=IEEE/size=40/align=0-12 947.8Mi ± 0% 1032.3Mi ± 0% +8.91% (p=0.002 n=6) CRC32/poly=IEEE/size=40/align=1-12 946.2Mi ± 0% 1032.5Mi ± 0% +9.12% (p=0.002 n=6) CRC32/poly=IEEE/size=512/align=0-12 12.54Gi ± 0% 12.49Gi ± 0% -0.37% (p=0.002 n=6) CRC32/poly=IEEE/size=512/align=1-12 5.346Gi ± 1% 6.475Gi ± 0% +21.12% (p=0.002 n=6) CRC32/poly=IEEE/size=1kB/align=0-12 18.80Gi ± 7% 19.02Gi ± 0% +1.20% (p=0.002 n=6) CRC32/poly=IEEE/size=1kB/align=1-12 9.454Gi ± 27% 11.690Gi ± 0% +23.66% (p=0.002 n=6) CRC32/poly=IEEE/size=4kB/align=0-12 38.86Gi ± 31% 41.00Gi ± 0% +5.49% (p=0.041 n=6) CRC32/poly=IEEE/size=4kB/align=1-12 27.10Gi ± 0% 30.32Gi ± 0% +11.89% (p=0.002 n=6) CRC32/poly=IEEE/size=32kB/align=0-12 58.05Gi ± 0% 57.74Gi ± 0% -0.53% (p=0.009 n=6) CRC32/poly=IEEE/size=32kB/align=1-12 52.22Gi ± 1% 52.95Gi ± 0% +1.41% (p=0.002 n=6) geomean 6.074Gi 6.724Gi +10.70% Change-Id: I378c0e84e798656384a8009f4ac48b51614489b2 Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10 Reviewed-on: https://go-review.googlesource.com/c/go/+/582395 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase Reviewed-by: Lynn Boger Reviewed-by: Eli Bendersky --- src/hash/crc32/crc32_ppc64le.s | 53 ++++++++++++++-------------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/src/hash/crc32/crc32_ppc64le.s b/src/hash/crc32/crc32_ppc64le.s index 84ef213312..fb7c783f93 100644 --- a/src/hash/crc32/crc32_ppc64le.s +++ b/src/hash/crc32/crc32_ppc64le.s @@ -63,67 +63,56 @@ loop: RLDICL $40,R9,$56,R17 // p[7] SLD $2,R17,R17 // p[7]*4 RLDICL $40,R7,$56,R8 // crc>>24 - ADD R17,R10,R17 // &tab[0][p[7]] SLD $2,R8,R8 // crc>>24*4 RLDICL $48,R9,$56,R18 // p[6] SLD $2,R18,R18 // p[6]*4 + MOVWZ (R10)(R17),R21 // 
tab[0][p[7]] ADD $1024,R10,R10 // tab[1] - MOVWZ 0(R17),R21 // tab[0][p[7]] RLDICL $56,R9,$56,R19 // p[5] - ADD R10,R18,R18 // &tab[1][p[6]] SLD $2,R19,R19 // p[5]*4:1 - MOVWZ 0(R18),R22 // tab[1][p[6]] + MOVWZ (R10)(R18),R22 // tab[1][p[6]] ADD $1024,R10,R10 // tab[2] XOR R21,R22,R21 // xor done R22 - ADD R19,R10,R19 // &tab[2][p[5]] - ANDCC $255,R9,R20 // p[4] ?? - SLD $2,R20,R20 // p[4]*4 - MOVWZ 0(R19),R23 // tab[2][p[5]] + CLRLSLDI $56,R9,$2,R20 + MOVWZ (R10)(R19),R23 // tab[2][p[5]] ADD $1024,R10,R10 // &tab[3] - ADD R20,R10,R20 // tab[3][p[4]] XOR R21,R23,R21 // xor done R23 - ADD $1024,R10,R10 // &tab[4] - MOVWZ 0(R20),R24 // tab[3][p[4]] - ADD R10,R8,R23 // &tab[4][crc>>24] + MOVWZ (R10)(R20),R24 // tab[3][p[4]] + ADD $1024,R10,R10 // &tab[4] XOR R21,R24,R21 // xor done R24 - MOVWZ 0(R23),R25 // tab[4][crc>>24] + MOVWZ (R10)(R8),R25 // tab[4][crc>>24] RLDICL $48,R7,$56,R24 // crc>>16&0xFF XOR R21,R25,R21 // xor done R25 ADD $1024,R10,R10 // &tab[5] SLD $2,R24,R24 // crc>>16&0xFF*4 - ADD R24,R10,R24 // &tab[5][crc>>16&0xFF] - MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF] + MOVWZ (R10)(R24),R26 // tab[5][crc>>16&0xFF] XOR R21,R26,R21 // xor done R26 RLDICL $56,R7,$56,R25 // crc>>8 ADD $1024,R10,R10 // &tab[6] SLD $2,R25,R25 // crc>>8&FF*2 - ADD R25,R10,R25 // &tab[6][crc>>8&0xFF] MOVBZ R7,R26 // crc&0xFF - ADD $1024,R10,R10 // &tab[7] - MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF] + MOVWZ (R10)(R25),R27 // tab[6][crc>>8&0xFF] + ADD $1024,R10,R10 // &tab[7] SLD $2,R26,R26 // crc&0xFF*2 XOR R21,R27,R21 // xor done R27 - ADD R26,R10,R26 // &tab[7][crc&0xFF] ADD $8,R5 // p = p[8:] - MOVWZ 0(R26),R28 // tab[7][crc&0xFF] + MOVWZ (R10)(R26),R28 // tab[7][crc&0xFF] XOR R21,R28,R21 // xor done R28 MOVWZ R21,R7 // crc for next round - BC 16,0,loop // next 8 bytes + BDNZ loop ANDCC $7,R6,R8 // any leftover bytes BEQ done // none --> done MOVD R8,CTR // byte count PCALIGN $16 // align short loop short: - MOVBZ 0(R5),R8 // get v - MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE? 
- SRD $8,R7,R14 // crc>>8 - XOR R8,R9,R8 // byte(crc)^v -> R8 - ADD $1,R5 // ptr to next v - SLD $2,R8 // convert index-> bytes - ADD R8,R4,R9 // &tab[byte(crc)^v] - MOVWZ 0(R9),R10 // tab[byte(crc)^v] - XOR R10,R14,R7 // loop crc in R7 - BC 16,0,short + MOVBZ 0(R5),R8 // get v + XOR R8,R7,R8 // byte(crc)^v -> R8 + RLDIC $2,R8,$54,R8 // rldicl r8,r8,2,22 + SRD $8,R7,R14 // crc>>8 + MOVWZ (R4)(R8),R10 + ADD $1,R5 + XOR R10,R14,R7 // loop crc in R7 + BDNZ short done: NOR R7,R7,R7 // ^crc MOVW R7,ret+40(FP) // return crc @@ -333,7 +322,7 @@ cool_top: LVX (R4+off112),V23 // next in buffer ADD $128,R4 // bump up buffer pointer - BC 16,0,cool_top // are we done? + BDNZ cool_top // are we done? first_cool_down: