
runtime: improve memmove performance ppc64,ppc64le

This change improves the performance of memmove
on ppc64 & ppc64le mainly for moves >=32 bytes.
In addition, the test to detect backward moves
 was enhanced to avoid backward moves if source
and dest were in different types of storage, since
backward moves might not always be efficient.
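
The new test boils down to a single unsigned compare; here is a minimal
Go sketch of the decision (illustrative only; names are invented and the
real implementation is the assembly shown further down):

package main

import (
	"fmt"
	"unsafe"
)

// forwardSafe sketches the new overlap test: compute dest-src and compare
// it against the length as an unsigned value. When dest is below src the
// subtraction wraps around to a huge number, so one unsigned compare
// covers both "no overlap" and "dest before src". A backward move is then
// taken only when dest really lands inside [src, src+n).
func forwardSafe(dest, src unsafe.Pointer, n uintptr) bool {
	return uintptr(dest)-uintptr(src) >= n
}

func main() {
	buf := make([]byte, 64)
	src := unsafe.Pointer(&buf[0])
	dst := unsafe.Pointer(&buf[8])
	fmt.Println(forwardSafe(dst, src, 32)) // false: regions overlap, copy backward
	fmt.Println(forwardSafe(src, dst, 32)) // true: dest below src, forward is safe
	fmt.Println(forwardSafe(dst, src, 8))  // true: dest == src+8, no overlap for n=8
}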

Fixes #14507

The following shows some of the improvements from the test
in the runtime package (columns: old MB/s, new MB/s, speedup):

BenchmarkMemmove32                   4229.56      4717.13      1.12x
BenchmarkMemmove64                   6156.03      7810.42      1.27x
BenchmarkMemmove128                  7521.69      12468.54     1.66x
BenchmarkMemmove256                  6729.90      18260.33     2.71x
BenchmarkMemmove512                  8521.59      18033.81     2.12x
BenchmarkMemmove1024                 9760.92      25762.61     2.64x
BenchmarkMemmove2048                 10241.00     29584.94     2.89x
BenchmarkMemmove4096                 10399.37     31882.31     3.07x

BenchmarkMemmoveUnalignedDst16       1943.69      2258.33      1.16x
BenchmarkMemmoveUnalignedDst32       3885.08      3965.81      1.02x
BenchmarkMemmoveUnalignedDst64       5121.63      6965.54      1.36x
BenchmarkMemmoveUnalignedDst128      7212.34      11372.68     1.58x
BenchmarkMemmoveUnalignedDst256      6564.52      16913.59     2.58x
BenchmarkMemmoveUnalignedDst512      8364.35      17782.57     2.13x
BenchmarkMemmoveUnalignedDst1024     9539.87      24914.72     2.61x
BenchmarkMemmoveUnalignedDst2048     9199.23      21235.11     2.31x
BenchmarkMemmoveUnalignedDst4096     10077.39     25231.99     2.50x

BenchmarkMemmoveUnalignedSrc32       3249.83      3742.52      1.15x
BenchmarkMemmoveUnalignedSrc64       5562.35      6627.96      1.19x
BenchmarkMemmoveUnalignedSrc128      6023.98      10200.84     1.69x
BenchmarkMemmoveUnalignedSrc256      6921.83      15258.43     2.20x
BenchmarkMemmoveUnalignedSrc512      8593.13      16541.97     1.93x
BenchmarkMemmoveUnalignedSrc1024     9730.95      22927.84     2.36x
BenchmarkMemmoveUnalignedSrc2048     9793.28      21537.73     2.20x
BenchmarkMemmoveUnalignedSrc4096     10132.96     26295.06     2.60x

Change-Id: I73af59970d4c97c728deabb9708b31ec7e01bdf2
Reviewed-on: https://go-review.googlesource.com/21990
Reviewed-by: Bill O'Farrell <billotosyr@gmail.com>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Author:    Lynn Boger
Date:      2016-04-13 08:58:10 -05:00
Committer: Brad Fitzpatrick
Parent:    66afbf1010
Commit:    c4807d4cc7


--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVD to+0(FP), R3
 	MOVD from+8(FP), R4
 	MOVD n+16(FP), R5
 	CMP R5, $0
 	BNE check
 	RET

+	// Determine if there are doublewords to
+	// copy so a more efficient move can be done
 check:
-	ANDCC $7, R5, R7   // R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
-	SRAD $3, R5, R6    // R6 is the number of words to copy
-	CMP R6, $0, CR1    // CR1[EQ] is set if there are no words to copy.
+	ANDCC $7, R5, R7   // R7: bytes to copy
+	SRAD $3, R5, R6    // R6: double words to copy
+	CMP R6, $0, CR1    // CR1[EQ] set if no double words to copy

-	CMP R3, R4, CR2
-	BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward"
+	// Determine overlap by subtracting dest - src and comparing against the
+	// length. This catches the cases where src and dest are in different types
+	// of storage such as stack and static to avoid doing backward move when not
+	// necessary.
+	SUB R4, R3, R8     // dest - src
+	CMPU R8, R5, CR2   // < len?
+	BC 12, 8, backward // BLT CR2 backward

-	// Copying forward proceeds by copying R6 words then copying R7 bytes.
-	// R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
-	// load/store, R3 and R4 point before the bytes that are to be copied.
+	// Copying forward if no overlap.
 	BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
-	MOVD R6, CTR
+	MOVD R6,CTR        // R6 = number of double words
+	SRADCC $2,R6,R8    // 32 byte chunks?
+	BNE forward32setup

-	SUB $8, R3
-	SUB $8, R4
+	// Move double words
+forward8:
+	MOVD 0(R4), R8     // double word
+	ADD $8,R4
+	MOVD R8, 0(R3)
+	ADD $8,R3
+	BC 16, 0, forward8
+	BR noforwardlarge  // handle remainder

-forwardlargeloop:
-	MOVDU 8(R4), R8
-	MOVDU R8, 8(R3)
-	BC 16, 0, forwardlargeloop // "BDNZ"
-
-	ADD $8, R3
-	ADD $8, R4
+	// Prepare for moves of 32 bytes at a time.
+forward32setup:
+	DCBTST (R3)        // prepare data cache
+	DCBT (R4)
+	MOVD R8, CTR       // number of 32 byte chunks

+forward32:
+	MOVD 0(R4), R8     // load 4 double words
+	MOVD 8(R4), R9
+	MOVD 16(R4), R14
+	MOVD 24(R4), R15
+	ADD $32,R4
+	MOVD R8, 0(R3)     // store those 4
+	MOVD R9, 8(R3)
+	MOVD R14,16(R3)
+	MOVD R15,24(R3)
+	ADD $32,R3         // bump up for next set
+	BC 16, 0, forward32 // continue
+	RLDCLCC $61,R5,$3,R6 // remaining doublewords
+	BEQ noforwardlarge
+	MOVD R6,CTR        // set up the CTR
+	BR forward8

 noforwardlarge:
-	BNE forwardtail    // Tests the bit set by ANDCC above
-	RET
+	CMP R7,$0          // any remaining bytes
+	BC 4, 1, LR

 forwardtail:
-	SUB $1, R3
-	SUB $1, R4
-	MOVD R7, CTR
+	MOVD R7, CTR       // move tail bytes

 forwardtailloop:
-	MOVBZU 1(R4), R8
-	MOVBZU R8, 1(R3)
+	MOVBZ 0(R4), R8    // move single bytes
+	ADD $1,R4
+	MOVBZ R8, 0(R3)
+	ADD $1,R3
 	BC 16, 0, forwardtailloop
 	RET

 backward:
-	// Copying backwards proceeds by copying R7 bytes then copying R6 words.
+	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
-	ADD R5, R4, R4
-	ADD R3, R5, R3
+	ADD R5, R4, R4     // end of source
+	ADD R3, R5, R3     // end of dest

-	BEQ nobackwardtail
-	MOVD R7, CTR
+	BEQ nobackwardtail // earlier condition
+	MOVD R7, CTR       // bytes to move

 backwardtailloop:
-	MOVBZU -1(R4), R8
-	MOVBZU R8, -1(R3)
+	MOVBZ -1(R4), R8   // point to last byte
+	SUB $1,R4
+	MOVBZ R8, -1(R3)
+	SUB $1,R3
 	BC 16, 0, backwardtailloop

 nobackwardtail:
-	BC 4, 6, backwardlarge // "BNE CR1"
-	RET
+	CMP R6,$0
+	BC 4, 5, LR

 backwardlarge:
 	MOVD R6, CTR

 backwardlargeloop:
-	MOVDU -8(R4), R8
-	MOVDU R8, -8(R3)
-	BC 16, 0, backwardlargeloop // "BDNZ"
+	MOVD -8(R4), R8
+	SUB $8,R4
+	MOVD R8, -8(R3)
+	SUB $8,R3
+	BC 16, 0, backwardlargeloop
 	RET
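
For readers who do not speak PPC64 assembler, the tiers of the new
forward path (32-byte chunks, then leftover doublewords, then tail
bytes) can be sketched in Go roughly as follows. This is an
illustrative sketch with invented names, not the actual implementation,
which works on raw pointers and prefetches via DCBTST/DCBT:

package main

import (
	"encoding/binary"
	"fmt"
)

// forwardCopy mirrors the tiers of the new forward path: 32-byte chunks
// (forward32, four doublewords per iteration), then leftover doublewords
// (forward8), then single tail bytes (forwardtailloop).
func forwardCopy(dst, src []byte) {
	n := len(src)
	i := 0
	for ; n-i >= 32; i += 32 { // 4 doublewords at a time, as in forward32
		for j := i; j < i+32; j += 8 {
			binary.LittleEndian.PutUint64(dst[j:], binary.LittleEndian.Uint64(src[j:]))
		}
	}
	for ; n-i >= 8; i += 8 { // remaining doublewords, as in forward8
		binary.LittleEndian.PutUint64(dst[i:], binary.LittleEndian.Uint64(src[i:]))
	}
	for ; i < n; i++ { // tail bytes, as in forwardtailloop
		dst[i] = src[i]
	}
}

func main() {
	// 43 bytes: one 32-byte chunk, one doubleword, three tail bytes.
	src := []byte("0123456789abcdefghijklmnopqrstuvwxyzABCDEFG")
	dst := make([]byte, len(src))
	forwardCopy(dst, src)
	fmt.Println(string(dst))
}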