mirror of
https://github.com/golang/go
synced 2024-11-13 19:00:25 -07:00
runtime: improve performance of memclr, memmove on ppc64x
This improves the asm implementations for memmove and memclr on ppc64x through use of vsx loads and stores when size is >= 32 bytes. For memclr, dcbz is used when the size is >= 512 and aligned to 128. Memclr/64 13.3ns ± 0% 10.7ns ± 0% -19.55% (p=0.000 n=8+7) Memclr/96 14.9ns ± 0% 11.4ns ± 0% -23.49% (p=0.000 n=8+8) Memclr/128 16.3ns ± 0% 12.3ns ± 0% -24.54% (p=0.000 n=8+8) Memclr/160 17.3ns ± 0% 13.0ns ± 0% -24.86% (p=0.000 n=8+8) Memclr/256 20.0ns ± 0% 15.3ns ± 0% -23.62% (p=0.000 n=8+8) Memclr/512 34.2ns ± 0% 10.2ns ± 0% -70.20% (p=0.000 n=8+8) Memclr/4096 178ns ± 0% 23ns ± 0% -87.13% (p=0.000 n=8+8) Memclr/65536 2.67µs ± 0% 0.30µs ± 0% -88.89% (p=0.000 n=7+8) Memclr/1M 43.2µs ± 0% 10.0µs ± 0% -76.85% (p=0.000 n=8+8) Memclr/4M 173µs ± 0% 40µs ± 0% -76.88% (p=0.000 n=8+8) Memclr/8M 349µs ± 0% 82µs ± 0% -76.58% (p=0.000 n=8+8) Memclr/16M 701µs ± 7% 672µs ± 0% -4.05% (p=0.040 n=8+7) Memclr/64M 2.70ms ± 0% 2.67ms ± 0% -0.96% (p=0.000 n=8+7) Memmove/32 6.59ns ± 0% 5.84ns ± 0% -11.34% (p=0.029 n=4+4) Memmove/64 7.91ns ± 0% 6.97ns ± 0% -11.92% (p=0.029 n=4+4) Memmove/128 10.5ns ± 0% 8.8ns ± 0% -16.24% (p=0.029 n=4+4) Memmove/256 21.0ns ± 0% 12.9ns ± 0% -38.57% (p=0.029 n=4+4) Memmove/512 28.4ns ± 0% 26.2ns ± 0% -7.75% (p=0.029 n=4+4) Memmove/1024 48.2ns ± 1% 39.4ns ± 0% -18.26% (p=0.029 n=4+4) Memmove/2048 85.4ns ± 0% 69.0ns ± 0% -19.20% (p=0.029 n=4+4) Memmove/4096 159ns ± 0% 128ns ± 0% -19.50% (p=0.029 n=4+4) Change-Id: I8c1adf88790845bf31444a15249456006eb5bf8b Reviewed-on: https://go-review.googlesource.com/c/141217 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <mike.munday@ibm.com>
This commit is contained in:
parent
e1978a2d7a
commit
aa9bcea390
@ -14,34 +14,68 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT|NOFRAME, $0-16
|
|||||||
// Determine if there are doublewords to clear
|
// Determine if there are doublewords to clear
|
||||||
check:
|
check:
|
||||||
ANDCC $7, R4, R5 // R5: leftover bytes to clear
|
ANDCC $7, R4, R5 // R5: leftover bytes to clear
|
||||||
SRAD $3, R4, R6 // R6: double words to clear
|
SRD $3, R4, R6 // R6: double words to clear
|
||||||
CMP R6, $0, CR1 // CR1[EQ] set if no double words
|
CMP R6, $0, CR1 // CR1[EQ] set if no double words
|
||||||
|
|
||||||
BC 12, 6, nozerolarge // only single bytes
|
BC 12, 6, nozerolarge // only single bytes
|
||||||
MOVD R6, CTR // R6 = number of double words
|
CMP R4, $512
|
||||||
SRADCC $2, R6, R7 // 32 byte chunks?
|
BLT under512 // special case for < 512
|
||||||
BNE zero32setup
|
ANDCC $127, R3, R8 // check for 128 alignment of address
|
||||||
|
BEQ zero512setup
|
||||||
|
|
||||||
|
ANDCC $7, R3, R15
|
||||||
|
BEQ zero512xsetup // at least 8 byte aligned
|
||||||
|
|
||||||
|
// zero bytes up to 8 byte alignment
|
||||||
|
|
||||||
|
ANDCC $1, R3, R15 // check for byte alignment
|
||||||
|
BEQ byte2
|
||||||
|
MOVB R0, 0(R3) // zero 1 byte
|
||||||
|
ADD $1, R3 // bump ptr by 1
|
||||||
|
ADD $-1, R4
|
||||||
|
|
||||||
|
byte2:
|
||||||
|
ANDCC $2, R3, R15 // check for 2 byte alignment
|
||||||
|
BEQ byte4
|
||||||
|
MOVH R0, 0(R3) // zero 2 bytes
|
||||||
|
ADD $2, R3 // bump ptr by 2
|
||||||
|
ADD $-2, R4
|
||||||
|
|
||||||
|
byte4:
|
||||||
|
ANDCC $4, R3, R15 // check for 4 byte alignment
|
||||||
|
BEQ zero512xsetup
|
||||||
|
MOVW R0, 0(R3) // zero 4 bytes
|
||||||
|
ADD $4, R3 // bump ptr by 4
|
||||||
|
ADD $-4, R4
|
||||||
|
BR zero512xsetup // ptr should now be 8 byte aligned
|
||||||
|
|
||||||
|
under512:
|
||||||
|
MOVD R6, CTR // R6 = number of double words
|
||||||
|
SRDCC $2, R6, R7 // 32 byte chunks?
|
||||||
|
BNE zero32setup
|
||||||
|
|
||||||
// Clear double words
|
// Clear double words
|
||||||
|
|
||||||
zero8:
|
zero8:
|
||||||
MOVD R0, 0(R3) // double word
|
MOVD R0, 0(R3) // double word
|
||||||
ADD $8, R3
|
ADD $8, R3
|
||||||
|
ADD $-8, R4
|
||||||
BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
|
BC 16, 0, zero8 // dec ctr, br zero8 if ctr not 0
|
||||||
BR nozerolarge // handle remainder
|
BR nozerolarge // handle leftovers
|
||||||
|
|
||||||
// Prepare to clear 32 bytes at a time.
|
// Prepare to clear 32 bytes at a time.
|
||||||
|
|
||||||
zero32setup:
|
zero32setup:
|
||||||
DCBTST (R3) // prepare data cache
|
DCBTST (R3) // prepare data cache
|
||||||
MOVD R7, CTR // number of 32 byte chunks
|
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
|
||||||
|
MOVD R7, CTR // number of 32 byte chunks
|
||||||
|
MOVD $16, R8
|
||||||
|
|
||||||
zero32:
|
zero32:
|
||||||
MOVD R0, 0(R3) // clear 4 double words
|
STXVD2X VS32, (R3+R0) // store 16 bytes
|
||||||
MOVD R0, 8(R3)
|
STXVD2X VS32, (R3+R8)
|
||||||
MOVD R0, 16(R3)
|
|
||||||
MOVD R0, 24(R3)
|
|
||||||
ADD $32, R3
|
ADD $32, R3
|
||||||
|
ADD $-32, R4
|
||||||
BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
|
BC 16, 0, zero32 // dec ctr, br zero32 if ctr not 0
|
||||||
RLDCLCC $61, R4, $3, R6 // remaining doublewords
|
RLDCLCC $61, R4, $3, R6 // remaining doublewords
|
||||||
BEQ nozerolarge
|
BEQ nozerolarge
|
||||||
@ -49,8 +83,8 @@ zero32:
|
|||||||
BR zero8
|
BR zero8
|
||||||
|
|
||||||
nozerolarge:
|
nozerolarge:
|
||||||
CMP R5, $0 // any remaining bytes
|
ANDCC $7, R4, R5 // any remaining bytes
|
||||||
BC 4, 1, LR // ble lr
|
BC 4, 1, LR // ble lr
|
||||||
|
|
||||||
zerotail:
|
zerotail:
|
||||||
MOVD R5, CTR // set up to clear tail bytes
|
MOVD R5, CTR // set up to clear tail bytes
|
||||||
@ -60,3 +94,70 @@ zerotailloop:
|
|||||||
ADD $1, R3
|
ADD $1, R3
|
||||||
BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
|
BC 16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
zero512xsetup: // 512 chunk with extra needed
|
||||||
|
ANDCC $8, R3, R11 // 8 byte alignment?
|
||||||
|
BEQ zero512setup16
|
||||||
|
MOVD R0, 0(R3) // clear 8 bytes
|
||||||
|
ADD $8, R3 // update ptr to next 8
|
||||||
|
ADD $-8, R4 // dec count by 8
|
||||||
|
|
||||||
|
zero512setup16:
|
||||||
|
ANDCC $127, R3, R14 // < 128 byte alignment
|
||||||
|
BEQ zero512setup // handle 128 byte alignment
|
||||||
|
MOVD $128, R15
|
||||||
|
SUB R14, R15, R14 // find increment to 128 alignment
|
||||||
|
SRD $4, R14, R15 // number of 16 byte chunks
|
||||||
|
|
||||||
|
zero512presetup:
|
||||||
|
MOVD R15, CTR // loop counter of 16 bytes
|
||||||
|
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
|
||||||
|
|
||||||
|
zero512preloop: // clear up to 128 alignment
|
||||||
|
STXVD2X VS32, (R3+R0) // clear 16 bytes
|
||||||
|
ADD $16, R3 // update ptr
|
||||||
|
ADD $-16, R4 // dec count
|
||||||
|
BC 16, 0, zero512preloop
|
||||||
|
|
||||||
|
zero512setup: // setup for dcbz loop
|
||||||
|
CMP R4, $512 // check if at least 512
|
||||||
|
BLT remain
|
||||||
|
SRD $9, R4, R8 // loop count for 512 chunks
|
||||||
|
MOVD R8, CTR // set up counter
|
||||||
|
MOVD $128, R9 // index regs for 128 bytes
|
||||||
|
MOVD $256, R10
|
||||||
|
MOVD $384, R11
|
||||||
|
|
||||||
|
zero512:
|
||||||
|
DCBZ (R3+R0) // clear first chunk
|
||||||
|
DCBZ (R3+R9) // clear second chunk
|
||||||
|
DCBZ (R3+R10) // clear third chunk
|
||||||
|
DCBZ (R3+R11) // clear fourth chunk
|
||||||
|
ADD $512, R3
|
||||||
|
ADD $-512, R4
|
||||||
|
BC 16, 0, zero512
|
||||||
|
|
||||||
|
remain:
|
||||||
|
CMP R4, $128 // check if 128 byte chunks left
|
||||||
|
BLT smaller
|
||||||
|
DCBZ (R3+R0) // clear 128
|
||||||
|
ADD $128, R3
|
||||||
|
ADD $-128, R4
|
||||||
|
BR remain
|
||||||
|
|
||||||
|
smaller:
|
||||||
|
ANDCC $127, R4, R7 // find leftovers
|
||||||
|
BEQ done
|
||||||
|
CMP R7, $64 // more than 64, do 32 at a time
|
||||||
|
BLT zero8setup // less than 64, do 8 at a time
|
||||||
|
SRD $5, R7, R7 // set up counter for 32
|
||||||
|
BR zero32setup
|
||||||
|
|
||||||
|
zero8setup:
|
||||||
|
SRDCC $3, R7, R7 // less than 8 bytes
|
||||||
|
BEQ nozerolarge
|
||||||
|
MOVD R7, CTR
|
||||||
|
BR zero8
|
||||||
|
|
||||||
|
done:
|
||||||
|
RET
|
||||||
|
@ -16,7 +16,7 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
|
|||||||
// copy so a more efficient move can be done
|
// copy so a more efficient move can be done
|
||||||
check:
|
check:
|
||||||
ANDCC $7, R5, R7 // R7: bytes to copy
|
ANDCC $7, R5, R7 // R7: bytes to copy
|
||||||
SRAD $3, R5, R6 // R6: double words to copy
|
SRD $3, R5, R6 // R6: double words to copy
|
||||||
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
|
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
|
||||||
|
|
||||||
// Determine overlap by subtracting dest - src and comparing against the
|
// Determine overlap by subtracting dest - src and comparing against the
|
||||||
@ -31,9 +31,9 @@ check:
|
|||||||
// Copying forward if no overlap.
|
// Copying forward if no overlap.
|
||||||
|
|
||||||
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
|
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
|
||||||
MOVD R6,CTR // R6 = number of double words
|
SRDCC $2,R6,R8 // 32 byte chunks?
|
||||||
SRADCC $2,R6,R8 // 32 byte chunks?
|
|
||||||
BNE forward32setup //
|
BNE forward32setup //
|
||||||
|
MOVD R6,CTR // R6 = number of double words
|
||||||
|
|
||||||
// Move double words
|
// Move double words
|
||||||
|
|
||||||
@ -51,17 +51,14 @@ forward32setup:
|
|||||||
DCBTST (R3) // prepare data cache
|
DCBTST (R3) // prepare data cache
|
||||||
DCBT (R4)
|
DCBT (R4)
|
||||||
MOVD R8, CTR // double work count
|
MOVD R8, CTR // double work count
|
||||||
|
MOVD $16, R8
|
||||||
|
|
||||||
forward32:
|
forward32:
|
||||||
MOVD 0(R4), R8 // load 4 double words
|
LXVD2X (R4+R0), VS32 // load 16 bytes
|
||||||
MOVD 8(R4), R9
|
LXVD2X (R4+R8), VS33
|
||||||
MOVD 16(R4), R14
|
ADD $32, R4
|
||||||
MOVD 24(R4), R15
|
STXVD2X VS32, (R3+R0) // store 16 bytes
|
||||||
ADD $32,R4
|
STXVD2X VS33, (R3+R8)
|
||||||
MOVD R8, 0(R3) // store those 4
|
|
||||||
MOVD R9, 8(R3)
|
|
||||||
MOVD R14,16(R3)
|
|
||||||
MOVD R15,24(R3)
|
|
||||||
ADD $32,R3 // bump up for next set
|
ADD $32,R3 // bump up for next set
|
||||||
BC 16, 0, forward32 // continue
|
BC 16, 0, forward32 // continue
|
||||||
RLDCLCC $61,R5,$3,R6 // remaining doublewords
|
RLDCLCC $61,R5,$3,R6 // remaining doublewords
|
||||||
@ -71,7 +68,7 @@ forward32:
|
|||||||
|
|
||||||
noforwardlarge:
|
noforwardlarge:
|
||||||
CMP R7,$0 // any remaining bytes
|
CMP R7,$0 // any remaining bytes
|
||||||
BC 4, 1, LR
|
BC 4, 1, LR // ble lr
|
||||||
|
|
||||||
forwardtail:
|
forwardtail:
|
||||||
MOVD R7, CTR // move tail bytes
|
MOVD R7, CTR // move tail bytes
|
||||||
@ -101,19 +98,39 @@ backwardtailloop:
|
|||||||
SUB $1,R4
|
SUB $1,R4
|
||||||
MOVBZ R8, -1(R3)
|
MOVBZ R8, -1(R3)
|
||||||
SUB $1,R3
|
SUB $1,R3
|
||||||
BC 16, 0, backwardtailloop
|
BC 16, 0, backwardtailloop // bndz
|
||||||
|
|
||||||
nobackwardtail:
|
nobackwardtail:
|
||||||
CMP R6,$0
|
BC 4, 5, LR // ble CR1 lr
|
||||||
BC 4, 5, LR
|
|
||||||
|
|
||||||
backwardlarge:
|
backwardlarge:
|
||||||
MOVD R6, CTR
|
MOVD R6, CTR
|
||||||
|
SUB R3, R4, R9 // Use vsx if moving
|
||||||
|
CMP R9, $32 // at least 32 byte chunks
|
||||||
|
BLT backwardlargeloop // and distance >= 32
|
||||||
|
SRDCC $2,R6,R8 // 32 byte chunks
|
||||||
|
BNE backward32setup
|
||||||
|
|
||||||
backwardlargeloop:
|
backwardlargeloop:
|
||||||
MOVD -8(R4), R8
|
MOVD -8(R4), R8
|
||||||
SUB $8,R4
|
SUB $8,R4
|
||||||
MOVD R8, -8(R3)
|
MOVD R8, -8(R3)
|
||||||
SUB $8,R3
|
SUB $8,R3
|
||||||
BC 16, 0, backwardlargeloop //
|
BC 16, 0, backwardlargeloop // bndz
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
backward32setup:
|
||||||
|
MOVD R8, CTR // set up loop ctr
|
||||||
|
MOVD $16, R8 // 32 bytes at at time
|
||||||
|
|
||||||
|
backward32loop:
|
||||||
|
SUB $32, R4
|
||||||
|
SUB $32, R3
|
||||||
|
LXVD2X (R4+R0), VS32 // load 16 bytes
|
||||||
|
LXVD2X (R4+R8), VS33
|
||||||
|
STXVD2X VS32, (R3+R0) // store 16 bytes
|
||||||
|
STXVD2X VS33, (R3+R8)
|
||||||
|
BC 16, 0, backward32loop // bndz
|
||||||
|
BC 4, 5, LR // ble CR1 lr
|
||||||
|
MOVD R6, CTR
|
||||||
|
BR backwardlargeloop
|
||||||
|
Loading…
Reference in New Issue
Block a user