runtime: improve memclr on ppc64x

This improves memclr performance for sizes >= 64 and < 512 by
unrolling the loop to clear 64 bytes at a time instead of the 32
bytes it cleared before (a sketch of the Go-level benchmark appears
after the commit details below).

On a POWER9, the improvement is:

Memclr/64       6.07ns ± 0%    5.17ns ± 0%  -14.86%  (p=1.000 n=1+1)
Memclr/256      11.8ns ± 0%     8.3ns ± 0%  -30.10%  (p=1.000 n=1+1)

GoMemclr/64     5.58ns ± 0%    5.02ns ± 0%  -10.04%  (p=1.000 n=1+1)
GoMemclr/256    12.0ns ± 0%     8.8ns ± 0%  -26.62%  (p=1.000 n=1+1)

Change-Id: I929389ae9e50128cba81e0c412e7ba431da7facc
Reviewed-on: https://go-review.googlesource.com/c/go/+/399895
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Author: Lynn Boger 2022-04-12 09:37:31 -05:00
parent 740a490f71
commit 91b9915d3f
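
For context: the GoMemclr numbers above measure clearing a byte slice with a
plain Go loop, which the compiler lowers to the runtime's memclr routine that
this change touches. Below is a minimal, illustrative sketch of that style of
benchmark, not the actual benchmark code in the runtime package; the package
name, benchSizes, and the sub-benchmark naming are invented for the example.

package memclr_test

import (
	"fmt"
	"testing"
)

// benchSizes lists two of the sizes reported above (illustrative only).
var benchSizes = []int{64, 256}

// BenchmarkGoMemclr clears a byte slice with a plain Go range loop.
// The compiler recognizes this clear idiom and lowers it to a call of
// the runtime memclr routine, so on ppc64x it exercises the assembly
// changed in this commit.
func BenchmarkGoMemclr(b *testing.B) {
	for _, n := range benchSizes {
		buf := make([]byte, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				for j := range buf {
					buf[j] = 0
				}
			}
		})
	}
}

With a toolchain built from this tree, the runtime's own Memclr and GoMemclr
benchmarks can be run with something like:

go test -run='^$' -bench=Memclr runtime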

src/runtime/memclr_ppc64x.s

@@ -52,37 +52,50 @@ byte4:
	BR zero512xsetup // ptr should now be 8 byte aligned
under512:
	MOVD R6, CTR // R6 = number of double words
	SRDCC $2, R6, R7 // 32 byte chunks?
	BNE zero32setup
	// Clear double words
zero8:
	MOVD R0, 0(R3) // double word
	ADD $8, R3
	ADD $-8, R4
	BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
	BR nozerolarge // handle leftovers
	// Prepare to clear 32 bytes at a time.
zero32setup:
	DCBTST (R3) // prepare data cache
	SRDCC $3, R6, R7 // 64 byte chunks?
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
	MOVD R7, CTR // number of 32 byte chunks
	MOVD $16, R8
	BEQ lt64gt8
zero32:
	// Prepare to clear 64 bytes at a time.
zero64setup:
	DCBTST (R3) // prepare data cache
	MOVD R7, CTR // number of 64 byte chunks
	MOVD $16, R8
	MOVD $32, R16
	MOVD $48, R17
zero64:
	STXVD2X VS32, (R3+R0) // store 16 bytes
	STXVD2X VS32, (R3+R8)
	ADD $32, R3
	ADD $-32, R4
	BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
	RLDCLCC $61, R4, $3, R6 // remaining doublewords
	STXVD2X VS32, (R3+R16)
	STXVD2X VS32, (R3+R17)
	ADD $64, R3
	ADD $-64, R4
	BDNZ zero64 // dec ctr, br zero64 if ctr not 0
	SRDCC $3, R4, R6 // remaining doublewords
	BEQ nozerolarge
	MOVD R6, CTR // set up the CTR for doublewords
	BR zero8
lt64gt8:
	CMP R4, $32
	BLT lt32gt8
	MOVD $16, R8
	STXVD2X VS32, (R3+R0)
	STXVD2X VS32, (R3+R8)
	ADD $-32, R4
	ADD $32, R3
lt32gt8:
	CMP R4, $16
	BLT lt16gt8
	STXVD2X VS32, (R3+R0)
	ADD $16, R3
	ADD $-16, R4
lt16gt8:
	CMP R4, $8
	BLT nozerolarge
	MOVD R0, 0(R3)
	ADD $8, R3
	ADD $-8, R4
nozerolarge:
	ANDCC $7, R4, R5 // any remaining bytes
@@ -94,7 +107,7 @@ zerotail:
zerotailloop:
	MOVB R0, 0(R3) // clear single bytes
	ADD $1, R3
	BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
	BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
	RET
zero512xsetup: // 512 chunk with extra needed
@@ -119,7 +132,7 @@ zero512preloop: // clear up to 128 alignment
	STXVD2X VS32, (R3+R0) // clear 16 bytes
	ADD $16, R3 // update ptr
	ADD $-16, R4 // dec count
	BC 16, 0, zero512preloop
	BDNZ zero512preloop
zero512setup: // setup for dcbz loop
	CMP R4, $512 // check if at least 512
@@ -129,6 +142,7 @@ zero512setup: // setup for dcbz loop
	MOVD $128, R9 // index regs for 128 bytes
	MOVD $256, R10
	MOVD $384, R11
	PCALIGN $32
zero512:
	DCBZ (R3+R0) // clear first chunk
@@ -136,8 +150,8 @@ zero512:
	DCBZ (R3+R10) // clear third chunk
	DCBZ (R3+R11) // clear fourth chunk
	ADD $512, R3
	ADD $-512, R4
	BC 16, 0, zero512
	BDNZ zero512
	ANDCC $511, R4
remain:
	CMP R4, $128 // check if 128 byte chunks left
@@ -150,16 +164,11 @@ remain:
smaller:
	ANDCC $127, R4, R7 // find leftovers
	BEQ done
	CMP R7, $64 // more than 64, do 32 at a time
	BLT zero8setup // less than 64, do 8 at a time
	SRD $5, R7, R7 // set up counter for 32
	BR zero32setup
zero8setup:
	SRDCC $3, R7, R7 // less than 8 bytes
	BEQ nozerolarge
	MOVD R7, CTR
	BR zero8
	CMP R7, $64 // more than 64, do 64 at a time
	XXLXOR VS32, VS32, VS32
	BLT lt64gt8 // less than 64
	SRD $6, R7, R7 // set up counter for 64
	BR zero64setup
done:
	RET