mirror of
https://github.com/golang/go
synced 2024-11-22 04:24:39 -07:00
runtime: optimize the function memclrNoHeapPointers on loong64
The relevant performance improved by 54.61%. benchmark: goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3A6000 @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ Memclr/5 4.803n ± 0% 2.801n ± 0% -41.68% (p=0.000 n=20) Memclr/16 4.803n ± 0% 3.202n ± 0% -33.33% (p=0.000 n=20) Memclr/64 9.605n ± 0% 5.061n ± 1% -47.30% (p=0.000 n=20) Memclr/256 29.22n ± 0% 10.24n ± 0% -64.96% (p=0.000 n=20) Memclr/4096 413.4n ± 0% 106.9n ± 0% -74.14% (p=0.000 n=20) Memclr/65536 6.566µ ± 0% 1.673µ ± 0% -74.52% (p=0.000 n=20) Memclr/1M 104.95µ ± 0% 52.51µ ± 0% -49.97% (p=0.000 n=20) Memclr/4M 419.8µ ± 0% 209.9µ ± 0% -49.99% (p=0.000 n=20) Memclr/8M 839.6µ ± 0% 419.9µ ± 0% -49.98% (p=0.000 n=20) Memclr/16M 1687.6µ ± 0% 845.3µ ± 0% -49.91% (p=0.000 n=20) Memclr/64M 6.725m ± 0% 3.389m ± 0% -49.61% (p=0.000 n=20) MemclrUnaligned/0_5 6.003n ± 0% 4.581n ± 0% -23.69% (p=0.000 n=20) MemclrUnaligned/0_16 6.005n ± 0% 5.084n ± 0% -15.33% (p=0.000 n=20) MemclrUnaligned/0_64 10.810n ± 0% 6.229n ± 0% -42.38% (p=0.000 n=20) MemclrUnaligned/0_256 30.43n ± 0% 10.68n ± 0% -64.90% (p=0.000 n=20) MemclrUnaligned/0_4096 414.8n ± 0% 107.1n ± 0% -74.18% (p=0.000 n=20) MemclrUnaligned/0_65536 6.566µ ± 0% 1.700µ ± 0% -74.11% (p=0.000 n=20) MemclrUnaligned/1_5 6.003n ± 0% 4.582n ± 0% -23.67% (p=0.000 n=20) MemclrUnaligned/1_16 11.610n ± 0% 5.080n ± 0% -56.24% (p=0.000 n=20) MemclrUnaligned/1_64 16.810n ± 0% 7.370n ± 0% -56.16% (p=0.000 n=20) MemclrUnaligned/1_256 36.42n ± 0% 12.95n ± 0% -64.44% (p=0.000 n=20) MemclrUnaligned/1_4096 420.6n ± 0% 114.6n ± 0% -72.75% (p=0.000 n=20) MemclrUnaligned/1_65536 6.573µ ± 0% 1.708µ ± 0% -74.01% (p=0.000 n=20) MemclrUnaligned/4_5 6.003n ± 0% 4.582n ± 0% -23.67% (p=0.000 n=20) MemclrUnaligned/4_16 10.410n ± 0% 5.069n ± 0% -51.30% (p=0.000 n=20) MemclrUnaligned/4_64 15.610n ± 0% 7.372n ± 0% -52.77% (p=0.000 n=20) MemclrUnaligned/4_256 35.22n ± 0% 12.95n ± 0% -63.23% (p=0.000 n=20) MemclrUnaligned/4_4096 419.4n ± 0% 114.6n ± 0% -72.68% (p=0.000 n=20) MemclrUnaligned/4_65536 6.571µ ± 0% 1.708µ ± 0% -74.01% (p=0.000 n=20) MemclrUnaligned/7_5 6.003n ± 0% 4.581n ± 0% -23.69% (p=0.000 n=20) MemclrUnaligned/7_16 8.855n ± 0% 5.079n ± 0% -42.65% (p=0.000 n=20) MemclrUnaligned/7_64 14.010n ± 0% 7.370n ± 0% -47.39% (p=0.000 n=20) MemclrUnaligned/7_256 33.62n ± 0% 12.95n ± 0% -61.48% (p=0.000 n=20) MemclrUnaligned/7_4096 417.8n ± 0% 114.7n ± 0% -72.56% (p=0.000 n=20) MemclrUnaligned/7_65536 6.570µ ± 0% 1.708µ ± 0% -74.00% (p=0.000 n=20) MemclrUnaligned/0_1M 104.96µ ± 0% 52.51µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/0_4M 419.8µ ± 0% 209.9µ ± 0% -49.99% (p=0.000 n=20) MemclrUnaligned/0_8M 839.5µ ± 0% 419.8µ ± 0% -49.99% (p=0.000 n=20) MemclrUnaligned/0_16M 1687.9µ ± 0% 844.9µ ± 0% -49.94% (p=0.000 n=20) MemclrUnaligned/0_64M 6.725m ± 0% 3.382m ± 0% -49.72% (p=0.000 n=20) MemclrUnaligned/1_1M 104.97µ ± 0% 52.51µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/1_4M 419.8µ ± 0% 210.0µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/1_8M 839.5µ ± 0% 419.8µ ± 0% -50.00% (p=0.000 n=20) MemclrUnaligned/1_16M 1687.6µ ± 0% 844.2µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/1_64M 6.724m ± 0% 3.367m ± 0% -49.93% (p=0.000 n=20) MemclrUnaligned/4_1M 104.97µ ± 0% 52.51µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/4_4M 419.8µ ± 0% 210.0µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/4_8M 839.5µ ± 0% 419.8µ ± 0% -50.00% (p=0.000 n=20) MemclrUnaligned/4_16M 1687.5µ ± 0% 844.4µ ± 0% -49.96% (p=0.000 n=20) MemclrUnaligned/4_64M 6.725m ± 0% 3.366m ± 0% -49.95% (p=0.000 n=20) MemclrUnaligned/7_1M 104.97µ ± 0% 52.51µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/7_4M 419.8µ ± 0% 210.0µ ± 0% -49.97% (p=0.000 n=20) MemclrUnaligned/7_8M 839.5µ ± 0% 419.8µ ± 0% -50.00% (p=0.000 n=20) MemclrUnaligned/7_16M 1687.9µ ± 0% 844.3µ ± 0% -49.98% (p=0.000 n=20) MemclrUnaligned/7_64M 6.724m ± 0% 3.362m ± 0% -50.00% (p=0.000 n=20) geomean 4.659µ 2.114µ -54.61% Change-Id: If0174e4cd8be5e17ad146698508a966158fe83e8 Reviewed-on: https://go-review.googlesource.com/c/go/+/589539 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn>
This commit is contained in:
parent
1bc795633e
commit
501b389efc
@ -5,36 +5,131 @@
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
// Register map
|
||||
//
|
||||
// R4: ptr
|
||||
// R5: n
|
||||
// R6: ptrend
|
||||
// R7: tmp
|
||||
|
||||
// Algorithm:
|
||||
//
|
||||
// 1. when count <= 64 bytes, memory alignment check is omitted.
|
||||
// The handling is divided into distinct cases based on the size
|
||||
// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
|
||||
// clr_8, clr_9through16, clr_17through32, and clr_33through64.
|
||||
//
|
||||
// 2. when count > 64 bytes, memory alignment check is performed.
|
||||
// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
|
||||
// then a 64-byte loop is executed to zero out memory.
|
||||
// When the number of remaining bytes not cleared is n < 64 bytes,
|
||||
// a tail processing is performed, invoking the corresponding case
|
||||
// based on the size of n.
|
||||
//
|
||||
// ptr newptr ptrend
|
||||
// | |<----count after correction---->|
|
||||
// |<-------------count before correction---------->|
|
||||
// |<--8-(ptr&7)-->| |<---64 bytes--->|
|
||||
// +------------------------------------------------+
|
||||
// | Head | Body | Tail |
|
||||
// +---------------+---------------+----------------+
|
||||
// newptr = ptr - (ptr & 7) + 8
|
||||
// count = count - 8 + (ptr & 7)
|
||||
|
||||
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
|
||||
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
|
||||
BEQ R5, clr_0
|
||||
ADDV R4, R5, R6
|
||||
|
||||
// if less than 8 bytes, do one byte at a time
|
||||
SGTU $8, R5, R8
|
||||
BNE R8, out
|
||||
tail:
|
||||
// <=64 bytes, clear directly, not check aligned
|
||||
SGTU $2, R5, R7
|
||||
BNE R7, clr_1
|
||||
SGTU $3, R5, R7
|
||||
BNE R7, clr_2
|
||||
SGTU $4, R5, R7
|
||||
BNE R7, clr_3
|
||||
SGTU $5, R5, R7
|
||||
BNE R7, clr_4
|
||||
SGTU $8, R5, R7
|
||||
BNE R7, clr_5through7
|
||||
SGTU $9, R5, R7
|
||||
BNE R7, clr_8
|
||||
SGTU $17, R5, R7
|
||||
BNE R7, clr_9through16
|
||||
SGTU $33, R5, R7
|
||||
BNE R7, clr_17through32
|
||||
SGTU $65, R5, R7
|
||||
BNE R7, clr_33through64
|
||||
|
||||
// do one byte at a time until 8-aligned
|
||||
AND $7, R4, R8
|
||||
BEQ R8, words
|
||||
MOVB R0, (R4)
|
||||
ADDV $1, R4
|
||||
JMP -4(PC)
|
||||
// n > 64 bytes, check aligned
|
||||
AND $7, R4, R7
|
||||
BEQ R7, body
|
||||
|
||||
words:
|
||||
// do 8 bytes at a time if there is room
|
||||
ADDV $-7, R6, R5
|
||||
|
||||
PCALIGN $16
|
||||
SGTU R5, R4, R8
|
||||
BEQ R8, out
|
||||
head:
|
||||
MOVV R0, (R4)
|
||||
ADDV $8, R4
|
||||
JMP -4(PC)
|
||||
SUBV R7, R4
|
||||
ADDV R7, R5
|
||||
ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7))
|
||||
SUBV $8, R5 // newn = n - (8 - (ptr & 7))
|
||||
SGTU $65, R5, R7
|
||||
BNE R7, clr_33through64
|
||||
|
||||
out:
|
||||
BEQ R4, R6, done
|
||||
MOVB R0, (R4)
|
||||
ADDV $1, R4
|
||||
JMP -3(PC)
|
||||
done:
|
||||
body:
|
||||
MOVV R0, (R4)
|
||||
MOVV R0, 8(R4)
|
||||
MOVV R0, 16(R4)
|
||||
MOVV R0, 24(R4)
|
||||
MOVV R0, 32(R4)
|
||||
MOVV R0, 40(R4)
|
||||
MOVV R0, 48(R4)
|
||||
MOVV R0, 56(R4)
|
||||
ADDV $-64, R5
|
||||
ADDV $64, R4
|
||||
SGTU $65, R5, R7
|
||||
BEQ R7, body
|
||||
BEQ R5, clr_0
|
||||
JMP tail
|
||||
|
||||
clr_0:
|
||||
RET
|
||||
clr_1:
|
||||
MOVB R0, (R4)
|
||||
RET
|
||||
clr_2:
|
||||
MOVH R0, (R4)
|
||||
RET
|
||||
clr_3:
|
||||
MOVH R0, (R4)
|
||||
MOVB R0, 2(R4)
|
||||
RET
|
||||
clr_4:
|
||||
MOVW R0, (R4)
|
||||
RET
|
||||
clr_5through7:
|
||||
MOVW R0, (R4)
|
||||
MOVW R0, -4(R6)
|
||||
RET
|
||||
clr_8:
|
||||
MOVV R0, (R4)
|
||||
RET
|
||||
clr_9through16:
|
||||
MOVV R0, (R4)
|
||||
MOVV R0, -8(R6)
|
||||
RET
|
||||
clr_17through32:
|
||||
MOVV R0, (R4)
|
||||
MOVV R0, 8(R4)
|
||||
MOVV R0, -16(R6)
|
||||
MOVV R0, -8(R6)
|
||||
RET
|
||||
clr_33through64:
|
||||
MOVV R0, (R4)
|
||||
MOVV R0, 8(R4)
|
||||
MOVV R0, 16(R4)
|
||||
MOVV R0, 24(R4)
|
||||
MOVV R0, -32(R6)
|
||||
MOVV R0, -24(R6)
|
||||
MOVV R0, -16(R6)
|
||||
MOVV R0, -8(R6)
|
||||
RET
|
||||
|
Loading…
Reference in New Issue
Block a user