1
0
mirror of https://github.com/golang/go synced 2024-11-22 03:14:41 -07:00

runtime: improve CALLFN macro for loong64

The previous CALLFN macro was copying a single byte at a time
which is inefficient on loong64. In this CL, according to the
argsize, copy 16 bytes or 8 bytes at a time, and copy 1 byte
a time for the rest.

benchmark in reflect on 3A5000 and 3A6000:

goos: linux
goarch: loong64
pkg: reflect
cpu: Loongson-3A6000 @ 2500.00MHz
                       |  bench.old   |              bench.new              |
                       |    sec/op    |   sec/op     vs base                |
CallArgCopy/size=128      360.2n ± 0%   266.9n ± 0%  -25.90% (p=0.000 n=20)
CallArgCopy/size=256      473.2n ± 0%   277.5n ± 0%  -41.35% (p=0.000 n=20)
CallArgCopy/size=1024    1128.0n ± 0%   332.9n ± 0%  -70.49% (p=0.000 n=20)
CallArgCopy/size=4096    3743.0n ± 0%   672.6n ± 0%  -82.03% (p=0.000 n=20)
CallArgCopy/size=65536   58.888µ ± 0%   9.667µ ± 0%  -83.58% (p=0.000 n=20)
geomean                   2.116µ        693.4n       -67.22%

                       |  bench.old   |               bench.new                |
                       |     B/s      |      B/s       vs base                 |
CallArgCopy/size=128     338.9Mi ± 0%    457.3Mi ± 0%   +34.94% (p=0.000 n=20)
CallArgCopy/size=256     516.0Mi ± 0%    879.8Mi ± 0%   +70.52% (p=0.000 n=20)
CallArgCopy/size=1024    865.5Mi ± 0%   2933.6Mi ± 0%  +238.94% (p=0.000 n=20)
CallArgCopy/size=4096    1.019Gi ± 0%    5.672Gi ± 0%  +456.52% (p=0.000 n=20)
CallArgCopy/size=65536   1.036Gi ± 0%    6.313Gi ± 0%  +509.13% (p=0.000 n=20)
geomean                  699.6Mi         2.085Gi       +205.10%

goos: linux
goarch: loong64
pkg: reflect
cpu: Loongson-3A5000 @ 2500.00MHz
                       |  bench.old   |              bench.new              |
                       |    sec/op    |   sec/op     vs base                |
CallArgCopy/size=128      466.6n ± 0%   368.7n ± 0%  -20.98% (p=0.000 n=20)
CallArgCopy/size=256      579.4n ± 0%   384.6n ± 0%  -33.62% (p=0.000 n=20)
CallArgCopy/size=1024    1273.0n ± 0%   492.0n ± 0%  -61.35% (p=0.000 n=20)
CallArgCopy/size=4096    4049.0n ± 0%   978.1n ± 0%  -75.84% (p=0.000 n=20)
CallArgCopy/size=65536    69.01µ ± 0%   14.50µ ± 0%  -78.99% (p=0.000 n=20)
geomean                   2.492µ        997.9n       -59.96%

                       |  bench.old   |               bench.new                |
                       |     B/s      |      B/s       vs base                 |
CallArgCopy/size=128     261.6Mi ± 0%    331.0Mi ± 0%   +26.54% (p=0.000 n=20)
CallArgCopy/size=256     421.4Mi ± 0%    634.8Mi ± 0%   +50.66% (p=0.000 n=20)
CallArgCopy/size=1024    767.2Mi ± 0%   1985.0Mi ± 0%  +158.75% (p=0.000 n=20)
CallArgCopy/size=4096    964.8Mi ± 0%   3993.8Mi ± 0%  +313.95% (p=0.000 n=20)
CallArgCopy/size=65536   905.7Mi ± 0%   4310.6Mi ± 0%  +375.97% (p=0.000 n=20)
geomean                  593.9Mi         1.449Gi       +149.76%

Change-Id: I9570395af80b2e4b760058098a1b5b07d4b37ad7
Reviewed-on: https://go-review.googlesource.com/c/go/+/627175
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Xiaolin Zhao 2024-11-12 15:50:03 +08:00 committed by abner chenc
parent 137328f92d
commit 992f63583a
2 changed files with 47 additions and 12 deletions

View File

@ -347,32 +347,65 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-48; \
NO_LOCAL_POINTERS; \ NO_LOCAL_POINTERS; \
/* copy arguments to stack */ \ /* copy arguments to stack */ \
MOVV arg+16(FP), R4; \ MOVV arg+16(FP), R4; \
MOVWU argsize+24(FP), R5; \ MOVWU argsize+24(FP), R5; \
MOVV R3, R12; \ MOVV R3, R12; \
MOVV $16, R13; \
ADDV $8, R12; \ ADDV $8, R12; \
ADDV R12, R5; \ BLT R5, R13, check8; \
BEQ R12, R5, 6(PC); \ /* copy 16 bytes a time */ \
MOVBU (R4), R6; \ MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R16; \
ADDV $1, R4; \ BEQ R16, copy16_again; \
MOVBU R6, (R12); \ loop16:; \
ADDV $1, R12; \ VMOVQ (R4), V0; \
JMP -5(PC); \ ADDV $16, R4; \
ADDV $-16, R5; \
VMOVQ V0, (R12); \
ADDV $16, R12; \
BGE R5, R13, loop16; \
JMP check8; \
copy16_again:; \
MOVV (R4), R14; \
MOVV 8(R4), R15; \
ADDV $16, R4; \
ADDV $-16, R5; \
MOVV R14, (R12); \
MOVV R15, 8(R12); \
ADDV $16, R12; \
BGE R5, R13, copy16_again; \
check8:; \
/* R13 = 8 */; \
SRLV $1, R13; \
BLT R5, R13, 6(PC); \
/* copy 8 bytes a time */ \
MOVV (R4), R14; \
ADDV $8, R4; \
ADDV $-8, R5; \
MOVV R14, (R12); \
ADDV $8, R12; \
BEQ R5, R0, 7(PC); \
/* copy 1 byte a time for the rest */ \
MOVBU (R4), R14; \
ADDV $1, R4; \
ADDV $-1, R5; \
MOVBU R14, (R12); \
ADDV $1, R12; \
JMP -6(PC); \
/* set up argument registers */ \ /* set up argument registers */ \
MOVV regArgs+40(FP), R25; \ MOVV regArgs+40(FP), R25; \
JAL ·unspillArgs(SB); \ JAL ·unspillArgs(SB); \
/* call function */ \ /* call function */ \
MOVV f+8(FP), REGCTXT; \ MOVV f+8(FP), REGCTXT; \
MOVV (REGCTXT), R25; \ MOVV (REGCTXT), R25; \
PCDATA $PCDATA_StackMapIndex, $0; \ PCDATA $PCDATA_StackMapIndex, $0; \
JAL (R25); \ JAL (R25); \
/* copy return values back */ \ /* copy return values back */ \
MOVV regArgs+40(FP), R25; \ MOVV regArgs+40(FP), R25; \
JAL ·spillArgs(SB); \ JAL ·spillArgs(SB); \
MOVV argtype+0(FP), R7; \ MOVV argtype+0(FP), R7; \
MOVV arg+16(FP), R4; \ MOVV arg+16(FP), R4; \
MOVWU n+24(FP), R5; \ MOVWU n+24(FP), R5; \
MOVWU retoffset+28(FP), R6; \ MOVWU retoffset+28(FP), R6; \
ADDV $8, R3, R12; \ ADDV $8, R3, R12; \
ADDV R6, R12; \ ADDV R6, R12; \
ADDV R6, R4; \ ADDV R6, R4; \
SUBVU R6, R5; \ SUBVU R6, R5; \

View File

@ -19,6 +19,8 @@ const (
offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA) offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA)
offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA) offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
) )
var ( var (