1
0
mirror of https://github.com/golang/go synced 2024-11-22 04:24:39 -07:00

runtime: optimize the function memclrNoHeapPointers on loong64

The geomean of the Memclr benchmarks below improved by 54.61%.

benchmark:
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                        │     old      │                 new                 │
                        │    sec/op    │   sec/op     vs base                │
Memclr/5                   4.803n ± 0%   2.801n ± 0%  -41.68% (p=0.000 n=20)
Memclr/16                  4.803n ± 0%   3.202n ± 0%  -33.33% (p=0.000 n=20)
Memclr/64                  9.605n ± 0%   5.061n ± 1%  -47.30% (p=0.000 n=20)
Memclr/256                 29.22n ± 0%   10.24n ± 0%  -64.96% (p=0.000 n=20)
Memclr/4096                413.4n ± 0%   106.9n ± 0%  -74.14% (p=0.000 n=20)
Memclr/65536               6.566µ ± 0%   1.673µ ± 0%  -74.52% (p=0.000 n=20)
Memclr/1M                 104.95µ ± 0%   52.51µ ± 0%  -49.97% (p=0.000 n=20)
Memclr/4M                  419.8µ ± 0%   209.9µ ± 0%  -49.99% (p=0.000 n=20)
Memclr/8M                  839.6µ ± 0%   419.9µ ± 0%  -49.98% (p=0.000 n=20)
Memclr/16M                1687.6µ ± 0%   845.3µ ± 0%  -49.91% (p=0.000 n=20)
Memclr/64M                 6.725m ± 0%   3.389m ± 0%  -49.61% (p=0.000 n=20)
MemclrUnaligned/0_5        6.003n ± 0%   4.581n ± 0%  -23.69% (p=0.000 n=20)
MemclrUnaligned/0_16       6.005n ± 0%   5.084n ± 0%  -15.33% (p=0.000 n=20)
MemclrUnaligned/0_64      10.810n ± 0%   6.229n ± 0%  -42.38% (p=0.000 n=20)
MemclrUnaligned/0_256      30.43n ± 0%   10.68n ± 0%  -64.90% (p=0.000 n=20)
MemclrUnaligned/0_4096     414.8n ± 0%   107.1n ± 0%  -74.18% (p=0.000 n=20)
MemclrUnaligned/0_65536    6.566µ ± 0%   1.700µ ± 0%  -74.11% (p=0.000 n=20)
MemclrUnaligned/1_5        6.003n ± 0%   4.582n ± 0%  -23.67% (p=0.000 n=20)
MemclrUnaligned/1_16      11.610n ± 0%   5.080n ± 0%  -56.24% (p=0.000 n=20)
MemclrUnaligned/1_64      16.810n ± 0%   7.370n ± 0%  -56.16% (p=0.000 n=20)
MemclrUnaligned/1_256      36.42n ± 0%   12.95n ± 0%  -64.44% (p=0.000 n=20)
MemclrUnaligned/1_4096     420.6n ± 0%   114.6n ± 0%  -72.75% (p=0.000 n=20)
MemclrUnaligned/1_65536    6.573µ ± 0%   1.708µ ± 0%  -74.01% (p=0.000 n=20)
MemclrUnaligned/4_5        6.003n ± 0%   4.582n ± 0%  -23.67% (p=0.000 n=20)
MemclrUnaligned/4_16      10.410n ± 0%   5.069n ± 0%  -51.30% (p=0.000 n=20)
MemclrUnaligned/4_64      15.610n ± 0%   7.372n ± 0%  -52.77% (p=0.000 n=20)
MemclrUnaligned/4_256      35.22n ± 0%   12.95n ± 0%  -63.23% (p=0.000 n=20)
MemclrUnaligned/4_4096     419.4n ± 0%   114.6n ± 0%  -72.68% (p=0.000 n=20)
MemclrUnaligned/4_65536    6.571µ ± 0%   1.708µ ± 0%  -74.01% (p=0.000 n=20)
MemclrUnaligned/7_5        6.003n ± 0%   4.581n ± 0%  -23.69% (p=0.000 n=20)
MemclrUnaligned/7_16       8.855n ± 0%   5.079n ± 0%  -42.65% (p=0.000 n=20)
MemclrUnaligned/7_64      14.010n ± 0%   7.370n ± 0%  -47.39% (p=0.000 n=20)
MemclrUnaligned/7_256      33.62n ± 0%   12.95n ± 0%  -61.48% (p=0.000 n=20)
MemclrUnaligned/7_4096     417.8n ± 0%   114.7n ± 0%  -72.56% (p=0.000 n=20)
MemclrUnaligned/7_65536    6.570µ ± 0%   1.708µ ± 0%  -74.00% (p=0.000 n=20)
MemclrUnaligned/0_1M      104.96µ ± 0%   52.51µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/0_4M       419.8µ ± 0%   209.9µ ± 0%  -49.99% (p=0.000 n=20)
MemclrUnaligned/0_8M       839.5µ ± 0%   419.8µ ± 0%  -49.99% (p=0.000 n=20)
MemclrUnaligned/0_16M     1687.9µ ± 0%   844.9µ ± 0%  -49.94% (p=0.000 n=20)
MemclrUnaligned/0_64M      6.725m ± 0%   3.382m ± 0%  -49.72% (p=0.000 n=20)
MemclrUnaligned/1_1M      104.97µ ± 0%   52.51µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/1_4M       419.8µ ± 0%   210.0µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/1_8M       839.5µ ± 0%   419.8µ ± 0%  -50.00% (p=0.000 n=20)
MemclrUnaligned/1_16M     1687.6µ ± 0%   844.2µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/1_64M      6.724m ± 0%   3.367m ± 0%  -49.93% (p=0.000 n=20)
MemclrUnaligned/4_1M      104.97µ ± 0%   52.51µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/4_4M       419.8µ ± 0%   210.0µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/4_8M       839.5µ ± 0%   419.8µ ± 0%  -50.00% (p=0.000 n=20)
MemclrUnaligned/4_16M     1687.5µ ± 0%   844.4µ ± 0%  -49.96% (p=0.000 n=20)
MemclrUnaligned/4_64M      6.725m ± 0%   3.366m ± 0%  -49.95% (p=0.000 n=20)
MemclrUnaligned/7_1M      104.97µ ± 0%   52.51µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/7_4M       419.8µ ± 0%   210.0µ ± 0%  -49.97% (p=0.000 n=20)
MemclrUnaligned/7_8M       839.5µ ± 0%   419.8µ ± 0%  -50.00% (p=0.000 n=20)
MemclrUnaligned/7_16M     1687.9µ ± 0%   844.3µ ± 0%  -49.98% (p=0.000 n=20)
MemclrUnaligned/7_64M      6.724m ± 0%   3.362m ± 0%  -50.00% (p=0.000 n=20)
geomean                    4.659µ        2.114µ       -54.61%

Change-Id: If0174e4cd8be5e17ad146698508a966158fe83e8
Reviewed-on: https://go-review.googlesource.com/c/go/+/589539
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
This commit is contained in:
Xiaolin Zhao 2024-06-03 15:49:23 +08:00 committed by abner chenc
parent 1bc795633e
commit 501b389efc

View File

@ -5,36 +5,131 @@
#include "go_asm.h"
#include "textflag.h"
// Register map
//
// R4: ptr
// R5: n
// R6: ptrend
// R7: tmp
// Algorithm:
//
// 1. when count <= 64 bytes, memory alignment check is omitted.
// The handling is divided into distinct cases based on the size
// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
// clr_8, clr_9through16, clr_17through32, and clr_33through64.
//
// 2. when count > 64 bytes, memory alignment check is performed.
// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
// then a 64-byte loop is executed to zero out memory.
// When the number of bytes still to be cleared is n <= 64,
// tail processing is performed, invoking the corresponding case
// based on the size of n.
//
// ptr            newptr                          ptrend
//  |               |<----count after correction---->|
//  |<-------------count before correction---------->|
//  |<--8-(ptr&7)-->|               |<---64 bytes--->|
//  +------------------------------------------------+
//  |   Head        |      Body     |      Tail      |
//  +---------------+---------------+----------------+
//  newptr = ptr - (ptr & 7) + 8
//  count = count - 8 + (ptr & 7)
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Zeroes the n bytes starting at ptr. On entry R4 holds ptr and R5 holds n;
// R6 is set to ptr+n (ptrend) and R7 is scratch. R0 is the source operand of
// every store that clears memory.
//
// NOTE(review): this listing comes from a diff hunk ("@ -5,36 +5,131 @") shown
// without +/- markers, so lines of the previous byte/word-loop implementation
// (the R8 checks, words:, out:, done: and the JMP -4(PC)/-3(PC) loops) appear
// interleaved with the new size-class dispatch. Confirm against the real file
// before relying on the statement order shown here.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
// n == 0: nothing to clear.
BEQ R5, clr_0
// R6 = ptrend = ptr + n; the clr_* cases below use it to address the end of the buffer.
ADDV R4, R5, R6
// if less than 8 bytes, do one byte at a time
// NOTE(review): R8 and the out label are used only by the byte/word loops;
// presumably this check belongs to the pre-change code — confirm.
SGTU $8, R5, R8
BNE R8, out
tail:
// <=64 bytes, clear directly, not check aligned
// Each SGTU $k, R5, R7 sets R7 when n < k and the following BNE branches on it,
// so the chain picks the first size class that n fits into. n is non-zero here:
// both paths that reach tail have already branched to clr_0 when R5 == 0.
// n == 1
SGTU $2, R5, R7
BNE R7, clr_1
// n == 2
SGTU $3, R5, R7
BNE R7, clr_2
// n == 3
SGTU $4, R5, R7
BNE R7, clr_3
// n == 4
SGTU $5, R5, R7
BNE R7, clr_4
// 5 <= n <= 7
SGTU $8, R5, R7
BNE R7, clr_5through7
// n == 8
SGTU $9, R5, R7
BNE R7, clr_8
// 9 <= n <= 16
SGTU $17, R5, R7
BNE R7, clr_9through16
// 17 <= n <= 32
SGTU $33, R5, R7
BNE R7, clr_17through32
// 33 <= n <= 64
SGTU $65, R5, R7
BNE R7, clr_33through64
// do one byte at a time until 8-aligned
// NOTE(review): byte loop using R8 and the words label; presumably pre-change code — confirm.
AND $7, R4, R8
BEQ R8, words
MOVB R0, (R4)
ADDV $1, R4
JMP -4(PC)
// n > 64 bytes, check aligned
// R7 = ptr & 7; zero means ptr is already 8-byte aligned, so go straight to the 64-byte loop.
AND $7, R4, R7
BEQ R7, body
words:
// do 8 bytes at a time if there is room
// NOTE(review): word loop bounded by ptrend-7 in R5; presumably pre-change code — confirm.
ADDV $-7, R6, R5
PCALIGN $16
SGTU R5, R4, R8
BEQ R8, out
head:
// Unaligned start: one 8-byte store at the original ptr covers the 8-(ptr&7) head bytes.
MOVV R0, (R4)
// NOTE(review): the next two lines look like the tail of the pre-change word loop — confirm.
ADDV $8, R4
JMP -4(PC)
// Round ptr down to an 8-byte boundary and grow n by the same amount (R7 still holds ptr & 7) ...
SUBV R7, R4
ADDV R7, R5
// ... then step past the 8 bytes the head store has already cleared.
ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7))
SUBV $8, R5 // newn = n - (8 - (ptr & 7))
// If at most 64 bytes remain, finish with the 33..64 case. n was > 64 and at most
// 7 bytes were removed, so newn is at least 58 and fits that case.
SGTU $65, R5, R7
BNE R7, clr_33through64
out:
// NOTE(review): byte-at-a-time loop up to ptrend (R6), with its done label; presumably pre-change code — confirm.
BEQ R4, R6, done
MOVB R0, (R4)
ADDV $1, R4
JMP -3(PC)
done:
body:
// Main loop: eight 8-byte stores clear 64 bytes per iteration.
MOVV R0, (R4)
MOVV R0, 8(R4)
MOVV R0, 16(R4)
MOVV R0, 24(R4)
MOVV R0, 32(R4)
MOVV R0, 40(R4)
MOVV R0, 48(R4)
MOVV R0, 56(R4)
// n -= 64; ptr += 64
ADDV $-64, R5
ADDV $64, R4
// R7 is set when n < 65; keep looping while more than 64 bytes remain.
SGTU $65, R5, R7
BEQ R7, body
// Nothing left: return. Otherwise 1..64 bytes remain; reuse the size-class dispatch.
BEQ R5, clr_0
JMP tail
clr_0:
RET
clr_1:
// One byte store.
MOVB R0, (R4)
RET
clr_2:
// One halfword (2-byte) store.
MOVH R0, (R4)
RET
clr_3:
// Halfword at offset 0 plus the byte at offset 2.
MOVH R0, (R4)
MOVB R0, 2(R4)
RET
clr_4:
// One 4-byte store.
MOVW R0, (R4)
RET
clr_5through7:
// Two 4-byte stores: the first 4 bytes and the last 4 bytes (ending at ptrend).
// They overlap in the middle, which is harmless because both write zero.
MOVW R0, (R4)
MOVW R0, -4(R6)
RET
clr_8:
// One 8-byte store.
MOVV R0, (R4)
RET
clr_9through16:
// Two 8-byte stores: the first 8 bytes and the last 8 bytes; they may overlap.
MOVV R0, (R4)
MOVV R0, -8(R6)
RET
clr_17through32:
// First 16 bytes from ptr and last 16 bytes ending at ptrend; the halves may overlap.
MOVV R0, (R4)
MOVV R0, 8(R4)
MOVV R0, -16(R6)
MOVV R0, -8(R6)
RET
clr_33through64:
// First 32 bytes from ptr and last 32 bytes ending at ptrend; the halves may overlap.
MOVV R0, (R4)
MOVV R0, 8(R4)
MOVV R0, 16(R4)
MOVV R0, 24(R4)
MOVV R0, -32(R6)
MOVV R0, -24(R6)
MOVV R0, -16(R6)
MOVV R0, -8(R6)
RET