1
0
mirror of https://github.com/golang/go synced 2024-09-29 21:34:28 -06:00

runtime: improve memmove for short moves on ppc64

This improves the performance of memmove for almost all moves <= 16 bytes
for the ppc64 assembler, improving linux/ppc64le, linux/ppc64, aix/ppc64.

Only the forward moves were changed, the backward moves were left as is.
Additional macro defines were added to improve the readability of the asm.

Results from power8:
name                      old time/op    new time/op    delta
Memmove/0                   5.70ns ± 0%    5.69ns ± 0%    -0.18%  (p=0.029 n=4+4)
Memmove/1                   5.54ns ± 0%    5.39ns ± 0%    -2.71%  (p=0.029 n=4+4)
Memmove/2                   6.31ns ± 0%    5.55ns ± 0%   -12.08%  (p=0.029 n=4+4)
Memmove/3                   7.41ns ± 0%    5.54ns ± 0%   -25.24%  (p=0.029 n=4+4)
Memmove/4                   8.41ns ± 0%    5.56ns ± 0%   -33.87%  (p=0.029 n=4+4)
Memmove/5                   10.1ns ± 5%     5.5ns ± 0%   -45.30%  (p=0.029 n=4+4)
Memmove/6                   10.3ns ± 0%     5.6ns ± 0%   -45.92%  (p=0.029 n=4+4)
Memmove/7                   11.4ns ± 0%     5.7ns ± 0%   -50.33%  (p=0.029 n=4+4)
Memmove/8                   5.66ns ± 0%    5.54ns ± 0%    -2.12%  (p=0.029 n=4+4)
Memmove/9                   5.66ns ± 0%    6.47ns ± 0%   +14.31%  (p=0.029 n=4+4)
Memmove/10                  6.67ns ± 0%    6.22ns ± 0%    -6.82%  (p=0.029 n=4+4)
Memmove/11                  7.83ns ± 0%    6.45ns ± 0%   -17.60%  (p=0.029 n=4+4)
Memmove/12                  8.91ns ± 0%    6.25ns ± 0%   -29.85%  (p=0.029 n=4+4)
Memmove/13                  9.81ns ± 0%    6.48ns ± 0%   -33.94%  (p=0.029 n=4+4)
Memmove/14                  10.7ns ± 1%     6.4ns ± 0%   -40.00%  (p=0.029 n=4+4)
Memmove/15                  11.8ns ± 0%     6.7ns ± 0%   -42.84%  (p=0.029 n=4+4)
Memmove/16                  5.63ns ± 0%    5.56ns ± 0%    -1.20%  (p=0.029 n=4+4)

Change-Id: I2de434f543c5a017395e0850fb9b9f7219583bbb
Reviewed-on: https://go-review.googlesource.com/c/go/+/223317
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
This commit is contained in:
Lynn Boger 2020-03-12 10:26:01 -04:00
parent f9f57c4443
commit cbd421f75b

View File

@ -9,78 +9,112 @@
// See memmove Go doc for important implementation constraints.
// func memmove(to, from unsafe.Pointer, n uintptr)
// target address
#define TGT R3
// source address
#define SRC R4
// length to move
#define LEN R5
// number of doublewords
#define DWORDS R6
// number of bytes < 8
#define BYTES R7
// const 16 used as index
#define IDX16 R8
// temp used for copies, etc.
#define TMP R9
// number of 32 byte chunks
#define QWORDS R10
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
MOVD to+0(FP), R3
MOVD from+8(FP), R4
MOVD n+16(FP), R5
MOVD to+0(FP), TGT
MOVD from+8(FP), SRC
MOVD n+16(FP), LEN
// Determine if there are doublewords to
// copy so a more efficient move can be done
check:
ANDCC $7, R5, R7 // R7: bytes to copy
SRD $3, R5, R6 // R6: double words to copy
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
ANDCC $7, LEN, BYTES // R7: bytes to copy
SRD $3, LEN, DWORDS // R6: double words to copy
MOVFL CR0, CR3 // save CR from ANDCC
CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy
// Determine overlap by subtracting dest - src and comparing against the
// length. The catches the cases where src and dest are in different types
// length. This catches the cases where src and dest are in different types
// of storage such as stack and static to avoid doing backward move when not
// necessary.
SUB R4, R3, R8 // dest - src
CMPU R8, R5, CR2 // < len?
SUB SRC, TGT, TMP // dest - src
CMPU TMP, LEN, CR2 // < len?
BC 12, 8, backward // BLT CR2 backward
// Copying forward if no overlap.
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
SRDCC $2,R6,R8 // 32 byte chunks?
BNE forward32setup //
MOVD R6,CTR // R6 = number of double words
// Move double words
forward8:
MOVD 0(R4), R8 // double word
ADD $8,R4
MOVD R8, 0(R3) //
ADD $8,R3
BC 16, 0, forward8
BR noforwardlarge // handle remainder
BC 12, 6, checkbytes // BEQ CR1, checkbytes
SRDCC $2, DWORDS, QWORDS // 32 byte chunks?
BEQ lt32gt8 // < 32 bytes
// Prepare for moves of 32 bytes at a time.
forward32setup:
DCBTST (R3) // prepare data cache
DCBT (R4)
MOVD R8, CTR // double work count
MOVD $16, R8
DCBTST (TGT) // prepare data cache
DCBT (SRC)
MOVD QWORDS, CTR // Number of 32 byte chunks
MOVD $16, IDX16 // 16 for index
forward32:
LXVD2X (R4+R0), VS32 // load 16 bytes
LXVD2X (R4+R8), VS33
ADD $32, R4
STXVD2X VS32, (R3+R0) // store 16 bytes
STXVD2X VS33, (R3+R8)
ADD $32,R3 // bump up for next set
LXVD2X (R0)(SRC), VS32 // load 16 bytes
LXVD2X (IDX16)(SRC), VS33 // load 16 bytes
ADD $32, SRC
STXVD2X VS32, (R0)(TGT) // store 16 bytes
STXVD2X VS33, (IDX16)(TGT)
ADD $32,TGT // bump up for next set
BC 16, 0, forward32 // continue
RLDCLCC $61,R5,$3,R6 // remaining doublewords
BEQ noforwardlarge
MOVD R6,CTR // set up the CTR
BR forward8
ANDCC $3, DWORDS // remaining doublewords
BEQ checkbytes // only bytes remain
noforwardlarge:
CMP R7,$0 // any remaining bytes
BC 4, 1, LR // ble lr
lt32gt8:
// At this point >= 8 and < 32
// Move 16 bytes if possible
CMP DWORDS, $2
BLT lt16
LXVD2X (R0)(SRC), VS32
ADD $-2, DWORDS
STXVD2X VS32, (R0)(TGT)
ADD $16, SRC
ADD $16, TGT
forwardtail:
MOVD R7, CTR // move tail bytes
forwardtailloop:
MOVBZ 0(R4), R8 // move single bytes
ADD $1,R4
MOVBZ R8, 0(R3)
ADD $1,R3
BC 16, 0, forwardtailloop
lt16: // Move 8 bytes if possible
CMP DWORDS, $1
BLT checkbytes
MOVD 0(SRC), TMP
ADD $8, SRC
MOVD TMP, 0(TGT)
ADD $8, TGT
checkbytes:
BC 12, 14, LR // BEQ lr
lt8: // Move word if possible
CMP BYTES, $4
BLT lt4
MOVWZ 0(SRC), TMP
ADD $-4, BYTES
MOVW TMP, 0(TGT)
ADD $4, SRC
ADD $4, TGT
lt4: // Move halfword if possible
CMP BYTES, $2
BLT lt2
MOVHZ 0(SRC), TMP
ADD $-2, BYTES
MOVH TMP, 0(TGT)
ADD $2, SRC
ADD $2, TGT
lt2: // Move last byte if 1 left
CMP BYTES, $1
BC 12, 0, LR // ble lr
MOVBZ 0(SRC), TMP
MOVBZ TMP, 0(TGT)
RET
backward:
@ -88,51 +122,51 @@ backward:
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.
ADD R5, R4, R4 // end of source
ADD R3, R5, R3 // end of dest
ADD LEN, SRC, SRC // end of source
ADD TGT, LEN, TGT // end of dest
BEQ nobackwardtail // earlier condition
MOVD R7, CTR // bytes to move
MOVD BYTES, CTR // bytes to move
backwardtailloop:
MOVBZ -1(R4), R8 // point to last byte
SUB $1,R4
MOVBZ R8, -1(R3)
SUB $1,R3
MOVBZ -1(SRC), TMP // point to last byte
SUB $1,SRC
MOVBZ TMP, -1(TGT)
SUB $1,TGT
BC 16, 0, backwardtailloop // bndz
nobackwardtail:
BC 4, 5, LR // ble CR1 lr
backwardlarge:
MOVD R6, CTR
SUB R3, R4, R9 // Use vsx if moving
CMP R9, $32 // at least 32 byte chunks
MOVD DWORDS, CTR
SUB TGT, SRC, TMP // Use vsx if moving
CMP TMP, $32 // at least 32 byte chunks
BLT backwardlargeloop // and distance >= 32
SRDCC $2,R6,R8 // 32 byte chunks
SRDCC $2,DWORDS,QWORDS // 32 byte chunks
BNE backward32setup
backwardlargeloop:
MOVD -8(R4), R8
SUB $8,R4
MOVD R8, -8(R3)
SUB $8,R3
MOVD -8(SRC), TMP
SUB $8,SRC
MOVD TMP, -8(TGT)
SUB $8,TGT
BC 16, 0, backwardlargeloop // bndz
RET
backward32setup:
MOVD R8, CTR // set up loop ctr
MOVD $16, R8 // 32 bytes at at time
MOVD QWORDS, CTR // set up loop ctr
MOVD $16, IDX16 // 32 bytes at at time
backward32loop:
SUB $32, R4
SUB $32, R3
LXVD2X (R4+R0), VS32 // load 16 bytes
LXVD2X (R4+R8), VS33
STXVD2X VS32, (R3+R0) // store 16 bytes
STXVD2X VS33, (R3+R8)
SUB $32, TGT
SUB $32, SRC
LXVD2X (R0)(TGT), VS32 // load 16 bytes
LXVD2X (IDX16)(TGT), VS33
STXVD2X VS32, (R0)(SRC) // store 16 bytes
STXVD2X VS33, (IDX16)(SRC)
BC 16, 0, backward32loop // bndz
BC 4, 5, LR // ble CR1 lr
MOVD R6, CTR
MOVD DWORDS, CTR
BR backwardlargeloop