mirror of
https://github.com/golang/go
synced 2024-11-17 17:54:48 -07:00
runtime: improve memmove for short moves on ppc64
This improves the performance of memmove for almost all moves <= 16 bytes for the ppc64 assembler, improving linux/ppc64le, linux/ppc64, aix/ppc64. Only the forward moves were changed, the backward moves were left as is. Additional macro defines were added to improve the readability of the asm. Results from power8: name old time/op new time/op delta Memmove/0 5.70ns ± 0% 5.69ns ± 0% -0.18% (p=0.029 n=4+4) Memmove/1 5.54ns ± 0% 5.39ns ± 0% -2.71% (p=0.029 n=4+4) Memmove/2 6.31ns ± 0% 5.55ns ± 0% -12.08% (p=0.029 n=4+4) Memmove/3 7.41ns ± 0% 5.54ns ± 0% -25.24% (p=0.029 n=4+4) Memmove/4 8.41ns ± 0% 5.56ns ± 0% -33.87% (p=0.029 n=4+4) Memmove/5 10.1ns ± 5% 5.5ns ± 0% -45.30% (p=0.029 n=4+4) Memmove/6 10.3ns ± 0% 5.6ns ± 0% -45.92% (p=0.029 n=4+4) Memmove/7 11.4ns ± 0% 5.7ns ± 0% -50.33% (p=0.029 n=4+4) Memmove/8 5.66ns ± 0% 5.54ns ± 0% -2.12% (p=0.029 n=4+4) Memmove/9 5.66ns ± 0% 6.47ns ± 0% +14.31% (p=0.029 n=4+4) Memmove/10 6.67ns ± 0% 6.22ns ± 0% -6.82% (p=0.029 n=4+4) Memmove/11 7.83ns ± 0% 6.45ns ± 0% -17.60% (p=0.029 n=4+4) Memmove/12 8.91ns ± 0% 6.25ns ± 0% -29.85% (p=0.029 n=4+4) Memmove/13 9.81ns ± 0% 6.48ns ± 0% -33.94% (p=0.029 n=4+4) Memmove/14 10.7ns ± 1% 6.4ns ± 0% -40.00% (p=0.029 n=4+4) Memmove/15 11.8ns ± 0% 6.7ns ± 0% -42.84% (p=0.029 n=4+4) Memmove/16 5.63ns ± 0% 5.56ns ± 0% -1.20% (p=0.029 n=4+4) Change-Id: I2de434f543c5a017395e0850fb9b9f7219583bbb Reviewed-on: https://go-review.googlesource.com/c/go/+/223317 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
This commit is contained in:
parent
f9f57c4443
commit
cbd421f75b
@ -9,78 +9,112 @@
|
||||
// See memmove Go doc for important implementation constraints.
|
||||
|
||||
// func memmove(to, from unsafe.Pointer, n uintptr)
|
||||
|
||||
// target address
|
||||
#define TGT R3
|
||||
// source address
|
||||
#define SRC R4
|
||||
// length to move
|
||||
#define LEN R5
|
||||
// number of doublewords
|
||||
#define DWORDS R6
|
||||
// number of bytes < 8
|
||||
#define BYTES R7
|
||||
// const 16 used as index
|
||||
#define IDX16 R8
|
||||
// temp used for copies, etc.
|
||||
#define TMP R9
|
||||
// number of 32 byte chunks
|
||||
#define QWORDS R10
|
||||
|
||||
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
|
||||
MOVD to+0(FP), R3
|
||||
MOVD from+8(FP), R4
|
||||
MOVD n+16(FP), R5
|
||||
MOVD to+0(FP), TGT
|
||||
MOVD from+8(FP), SRC
|
||||
MOVD n+16(FP), LEN
|
||||
|
||||
// Determine if there are doublewords to
|
||||
// copy so a more efficient move can be done
|
||||
check:
|
||||
ANDCC $7, R5, R7 // R7: bytes to copy
|
||||
SRD $3, R5, R6 // R6: double words to copy
|
||||
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
|
||||
ANDCC $7, LEN, BYTES // R7: bytes to copy
|
||||
SRD $3, LEN, DWORDS // R6: double words to copy
|
||||
MOVFL CR0, CR3 // save CR from ANDCC
|
||||
CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy
|
||||
|
||||
// Determine overlap by subtracting dest - src and comparing against the
|
||||
// length. The catches the cases where src and dest are in different types
|
||||
// length. This catches the cases where src and dest are in different types
|
||||
// of storage such as stack and static to avoid doing backward move when not
|
||||
// necessary.
|
||||
|
||||
SUB R4, R3, R8 // dest - src
|
||||
CMPU R8, R5, CR2 // < len?
|
||||
SUB SRC, TGT, TMP // dest - src
|
||||
CMPU TMP, LEN, CR2 // < len?
|
||||
BC 12, 8, backward // BLT CR2 backward
|
||||
|
||||
// Copying forward if no overlap.
|
||||
|
||||
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
|
||||
SRDCC $2,R6,R8 // 32 byte chunks?
|
||||
BNE forward32setup //
|
||||
MOVD R6,CTR // R6 = number of double words
|
||||
|
||||
// Move double words
|
||||
|
||||
forward8:
|
||||
MOVD 0(R4), R8 // double word
|
||||
ADD $8,R4
|
||||
MOVD R8, 0(R3) //
|
||||
ADD $8,R3
|
||||
BC 16, 0, forward8
|
||||
BR noforwardlarge // handle remainder
|
||||
BC 12, 6, checkbytes // BEQ CR1, checkbytes
|
||||
SRDCC $2, DWORDS, QWORDS // 32 byte chunks?
|
||||
BEQ lt32gt8 // < 32 bytes
|
||||
|
||||
// Prepare for moves of 32 bytes at a time.
|
||||
|
||||
forward32setup:
|
||||
DCBTST (R3) // prepare data cache
|
||||
DCBT (R4)
|
||||
MOVD R8, CTR // double work count
|
||||
MOVD $16, R8
|
||||
DCBTST (TGT) // prepare data cache
|
||||
DCBT (SRC)
|
||||
MOVD QWORDS, CTR // Number of 32 byte chunks
|
||||
MOVD $16, IDX16 // 16 for index
|
||||
|
||||
forward32:
|
||||
LXVD2X (R4+R0), VS32 // load 16 bytes
|
||||
LXVD2X (R4+R8), VS33
|
||||
ADD $32, R4
|
||||
STXVD2X VS32, (R3+R0) // store 16 bytes
|
||||
STXVD2X VS33, (R3+R8)
|
||||
ADD $32,R3 // bump up for next set
|
||||
LXVD2X (R0)(SRC), VS32 // load 16 bytes
|
||||
LXVD2X (IDX16)(SRC), VS33 // load 16 bytes
|
||||
ADD $32, SRC
|
||||
STXVD2X VS32, (R0)(TGT) // store 16 bytes
|
||||
STXVD2X VS33, (IDX16)(TGT)
|
||||
ADD $32,TGT // bump up for next set
|
||||
BC 16, 0, forward32 // continue
|
||||
RLDCLCC $61,R5,$3,R6 // remaining doublewords
|
||||
BEQ noforwardlarge
|
||||
MOVD R6,CTR // set up the CTR
|
||||
BR forward8
|
||||
ANDCC $3, DWORDS // remaining doublewords
|
||||
BEQ checkbytes // only bytes remain
|
||||
|
||||
noforwardlarge:
|
||||
CMP R7,$0 // any remaining bytes
|
||||
BC 4, 1, LR // ble lr
|
||||
lt32gt8:
|
||||
// At this point >= 8 and < 32
|
||||
// Move 16 bytes if possible
|
||||
CMP DWORDS, $2
|
||||
BLT lt16
|
||||
LXVD2X (R0)(SRC), VS32
|
||||
ADD $-2, DWORDS
|
||||
STXVD2X VS32, (R0)(TGT)
|
||||
ADD $16, SRC
|
||||
ADD $16, TGT
|
||||
|
||||
forwardtail:
|
||||
MOVD R7, CTR // move tail bytes
|
||||
|
||||
forwardtailloop:
|
||||
MOVBZ 0(R4), R8 // move single bytes
|
||||
ADD $1,R4
|
||||
MOVBZ R8, 0(R3)
|
||||
ADD $1,R3
|
||||
BC 16, 0, forwardtailloop
|
||||
lt16: // Move 8 bytes if possible
|
||||
CMP DWORDS, $1
|
||||
BLT checkbytes
|
||||
MOVD 0(SRC), TMP
|
||||
ADD $8, SRC
|
||||
MOVD TMP, 0(TGT)
|
||||
ADD $8, TGT
|
||||
checkbytes:
|
||||
BC 12, 14, LR // BEQ lr
|
||||
lt8: // Move word if possible
|
||||
CMP BYTES, $4
|
||||
BLT lt4
|
||||
MOVWZ 0(SRC), TMP
|
||||
ADD $-4, BYTES
|
||||
MOVW TMP, 0(TGT)
|
||||
ADD $4, SRC
|
||||
ADD $4, TGT
|
||||
lt4: // Move halfword if possible
|
||||
CMP BYTES, $2
|
||||
BLT lt2
|
||||
MOVHZ 0(SRC), TMP
|
||||
ADD $-2, BYTES
|
||||
MOVH TMP, 0(TGT)
|
||||
ADD $2, SRC
|
||||
ADD $2, TGT
|
||||
lt2: // Move last byte if 1 left
|
||||
CMP BYTES, $1
|
||||
BC 12, 0, LR // ble lr
|
||||
MOVBZ 0(SRC), TMP
|
||||
MOVBZ TMP, 0(TGT)
|
||||
RET
|
||||
|
||||
backward:
|
||||
@ -88,51 +122,51 @@ backward:
|
||||
// R3 and R4 are advanced to the end of the destination/source buffers
|
||||
// respectively and moved back as we copy.
|
||||
|
||||
ADD R5, R4, R4 // end of source
|
||||
ADD R3, R5, R3 // end of dest
|
||||
ADD LEN, SRC, SRC // end of source
|
||||
ADD TGT, LEN, TGT // end of dest
|
||||
|
||||
BEQ nobackwardtail // earlier condition
|
||||
|
||||
MOVD R7, CTR // bytes to move
|
||||
MOVD BYTES, CTR // bytes to move
|
||||
|
||||
backwardtailloop:
|
||||
MOVBZ -1(R4), R8 // point to last byte
|
||||
SUB $1,R4
|
||||
MOVBZ R8, -1(R3)
|
||||
SUB $1,R3
|
||||
MOVBZ -1(SRC), TMP // point to last byte
|
||||
SUB $1,SRC
|
||||
MOVBZ TMP, -1(TGT)
|
||||
SUB $1,TGT
|
||||
BC 16, 0, backwardtailloop // bndz
|
||||
|
||||
nobackwardtail:
|
||||
BC 4, 5, LR // ble CR1 lr
|
||||
|
||||
backwardlarge:
|
||||
MOVD R6, CTR
|
||||
SUB R3, R4, R9 // Use vsx if moving
|
||||
CMP R9, $32 // at least 32 byte chunks
|
||||
MOVD DWORDS, CTR
|
||||
SUB TGT, SRC, TMP // Use vsx if moving
|
||||
CMP TMP, $32 // at least 32 byte chunks
|
||||
BLT backwardlargeloop // and distance >= 32
|
||||
SRDCC $2,R6,R8 // 32 byte chunks
|
||||
SRDCC $2,DWORDS,QWORDS // 32 byte chunks
|
||||
BNE backward32setup
|
||||
|
||||
backwardlargeloop:
|
||||
MOVD -8(R4), R8
|
||||
SUB $8,R4
|
||||
MOVD R8, -8(R3)
|
||||
SUB $8,R3
|
||||
MOVD -8(SRC), TMP
|
||||
SUB $8,SRC
|
||||
MOVD TMP, -8(TGT)
|
||||
SUB $8,TGT
|
||||
BC 16, 0, backwardlargeloop // bndz
|
||||
RET
|
||||
|
||||
backward32setup:
|
||||
MOVD R8, CTR // set up loop ctr
|
||||
MOVD $16, R8 // 32 bytes at at time
|
||||
MOVD QWORDS, CTR // set up loop ctr
|
||||
MOVD $16, IDX16 // 32 bytes at at time
|
||||
|
||||
backward32loop:
|
||||
SUB $32, R4
|
||||
SUB $32, R3
|
||||
LXVD2X (R4+R0), VS32 // load 16 bytes
|
||||
LXVD2X (R4+R8), VS33
|
||||
STXVD2X VS32, (R3+R0) // store 16 bytes
|
||||
STXVD2X VS33, (R3+R8)
|
||||
SUB $32, TGT
|
||||
SUB $32, SRC
|
||||
LXVD2X (R0)(TGT), VS32 // load 16 bytes
|
||||
LXVD2X (IDX16)(TGT), VS33
|
||||
STXVD2X VS32, (R0)(SRC) // store 16 bytes
|
||||
STXVD2X VS33, (IDX16)(SRC)
|
||||
BC 16, 0, backward32loop // bndz
|
||||
BC 4, 5, LR // ble CR1 lr
|
||||
MOVD R6, CTR
|
||||
MOVD DWORDS, CTR
|
||||
BR backwardlargeloop
|
||||
|
Loading…
Reference in New Issue
Block a user