mirror of
https://github.com/golang/go
synced 2024-11-19 15:24:46 -07:00
b46d398887
Improve runtime memclr_arm64.s using ZVA feature to zero out memory when n is at least 64 bytes. Also add DCZID_EL0 system register to use in MRS instruction. Benchmark results of runtime/Memclr on Amberwing: name old time/op new time/op delta Memclr/5 12.7ns ± 0% 12.7ns ± 0% ~ (all equal) Memclr/16 12.7ns ± 0% 12.2ns ± 1% -4.13% (p=0.000 n=7+8) Memclr/64 14.0ns ± 0% 14.6ns ± 1% +4.29% (p=0.000 n=7+8) Memclr/256 23.7ns ± 0% 25.7ns ± 0% +8.44% (p=0.000 n=8+7) Memclr/4096 204ns ± 0% 74ns ± 0% -63.71% (p=0.000 n=8+8) Memclr/65536 2.89µs ± 0% 0.84µs ± 0% -70.91% (p=0.000 n=8+8) Memclr/1M 45.9µs ± 0% 17.0µs ± 0% -62.88% (p=0.000 n=8+8) Memclr/4M 184µs ± 0% 77µs ± 4% -57.94% (p=0.001 n=6+8) Memclr/8M 367µs ± 0% 144µs ± 1% -60.72% (p=0.000 n=7+8) Memclr/16M 734µs ± 0% 293µs ± 1% -60.09% (p=0.000 n=8+8) Memclr/64M 2.94ms ± 0% 1.23ms ± 0% -58.06% (p=0.000 n=7+8) GoMemclr/5 8.00ns ± 0% 8.79ns ± 0% +9.83% (p=0.000 n=8+8) GoMemclr/16 8.00ns ± 0% 7.60ns ± 0% -5.00% (p=0.000 n=8+8) GoMemclr/64 10.8ns ± 0% 10.4ns ± 0% -3.70% (p=0.000 n=8+8) GoMemclr/256 20.4ns ± 0% 21.2ns ± 0% +3.92% (p=0.000 n=8+8) name old speed new speed delta Memclr/5 394MB/s ± 0% 393MB/s ± 0% -0.28% (p=0.006 n=8+8) Memclr/16 1.26GB/s ± 0% 1.31GB/s ± 1% +4.07% (p=0.000 n=7+8) Memclr/64 4.57GB/s ± 0% 4.39GB/s ± 2% -3.91% (p=0.000 n=7+8) Memclr/256 10.8GB/s ± 0% 10.0GB/s ± 0% -7.95% (p=0.001 n=7+6) Memclr/4096 20.1GB/s ± 0% 55.3GB/s ± 0% +175.46% (p=0.000 n=8+8) Memclr/65536 22.6GB/s ± 0% 77.8GB/s ± 0% +243.63% (p=0.000 n=7+8) Memclr/1M 22.8GB/s ± 0% 61.5GB/s ± 0% +169.38% (p=0.000 n=8+8) Memclr/4M 22.8GB/s ± 0% 54.3GB/s ± 4% +137.85% (p=0.001 n=6+8) Memclr/8M 22.8GB/s ± 0% 58.1GB/s ± 1% +154.56% (p=0.000 n=7+8) Memclr/16M 22.8GB/s ± 0% 57.2GB/s ± 1% +150.54% (p=0.000 n=8+8) Memclr/64M 22.8GB/s ± 0% 54.4GB/s ± 0% +138.42% (p=0.000 n=7+8) GoMemclr/5 625MB/s ± 0% 569MB/s ± 0% -8.90% (p=0.000 n=7+8) GoMemclr/16 2.00GB/s ± 0% 2.10GB/s ± 0% +5.26% (p=0.000 n=8+8) GoMemclr/64 5.92GB/s ± 0% 6.15GB/s ± 0% +3.83% 
(p=0.000 n=7+8) GoMemclr/256 12.5GB/s ± 0% 12.1GB/s ± 0% -3.77% (p=0.000 n=8+7) Benchmark results of runtime/Memclr on Amberwing without ZVA: name old time/op new time/op delta Memclr/5 12.7ns ± 0% 12.8ns ± 0% +0.79% (p=0.008 n=5+5) Memclr/16 12.7ns ± 0% 12.7ns ± 0% ~ (p=0.444 n=5+5) Memclr/64 14.0ns ± 0% 14.4ns ± 0% +2.86% (p=0.008 n=5+5) Memclr/256 23.7ns ± 1% 19.2ns ± 0% -19.06% (p=0.008 n=5+5) Memclr/4096 203ns ± 0% 119ns ± 0% -41.38% (p=0.008 n=5+5) Memclr/65536 2.89µs ± 0% 1.66µs ± 0% -42.76% (p=0.008 n=5+5) Memclr/1M 45.9µs ± 0% 26.2µs ± 0% -42.82% (p=0.008 n=5+5) Memclr/4M 184µs ± 0% 105µs ± 0% -42.81% (p=0.008 n=5+5) Memclr/8M 367µs ± 0% 210µs ± 0% -42.76% (p=0.008 n=5+5) Memclr/16M 734µs ± 0% 420µs ± 0% -42.74% (p=0.008 n=5+5) Memclr/64M 2.94ms ± 0% 1.69ms ± 0% -42.46% (p=0.008 n=5+5) GoMemclr/5 8.00ns ± 0% 8.40ns ± 0% +5.00% (p=0.008 n=5+5) GoMemclr/16 8.00ns ± 0% 8.40ns ± 0% +5.00% (p=0.008 n=5+5) GoMemclr/64 10.8ns ± 0% 9.6ns ± 0% -11.02% (p=0.008 n=5+5) GoMemclr/256 20.4ns ± 0% 17.2ns ± 0% -15.69% (p=0.008 n=5+5) name old speed new speed delta Memclr/5 393MB/s ± 0% 391MB/s ± 0% -0.64% (p=0.008 n=5+5) Memclr/16 1.26GB/s ± 0% 1.26GB/s ± 0% -0.55% (p=0.008 n=5+5) Memclr/64 4.57GB/s ± 0% 4.44GB/s ± 0% -2.79% (p=0.008 n=5+5) Memclr/256 10.8GB/s ± 0% 13.3GB/s ± 0% +23.07% (p=0.016 n=4+5) Memclr/4096 20.1GB/s ± 0% 34.3GB/s ± 0% +70.91% (p=0.008 n=5+5) Memclr/65536 22.7GB/s ± 0% 39.6GB/s ± 0% +74.65% (p=0.008 n=5+5) Memclr/1M 22.8GB/s ± 0% 40.0GB/s ± 0% +74.88% (p=0.008 n=5+5) Memclr/4M 22.8GB/s ± 0% 39.9GB/s ± 0% +74.84% (p=0.008 n=5+5) Memclr/8M 22.9GB/s ± 0% 39.9GB/s ± 0% +74.71% (p=0.008 n=5+5) Memclr/16M 22.9GB/s ± 0% 39.9GB/s ± 0% +74.64% (p=0.008 n=5+5) Memclr/64M 22.8GB/s ± 0% 39.7GB/s ± 0% +73.79% (p=0.008 n=5+5) GoMemclr/5 625MB/s ± 0% 595MB/s ± 0% -4.77% (p=0.000 n=4+5) GoMemclr/16 2.00GB/s ± 0% 1.90GB/s ± 0% -4.77% (p=0.008 n=5+5) GoMemclr/64 5.92GB/s ± 0% 6.66GB/s ± 0% +12.48% (p=0.016 n=4+5) GoMemclr/256 12.5GB/s ± 0% 14.9GB/s ± 0% +18.95% 
(p=0.008 n=5+5) Fixes #22948 Change-Id: Iaae4e22391e25b54d299821bb7f8a81ac3986b93 Reviewed-on: https://go-review.googlesource.com/82055 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
183 lines
3.5 KiB
ArmAsm
183 lines
3.5 KiB
ArmAsm
// Copyright 2014 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include "textflag.h"
|
|
|
|
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Zeroes the n bytes starting at ptr. Arguments are read from the caller's
// frame at ptr+0(FP) and n+8(FP); the function has no local frame ($0-16).
//
// Strategy:
//   n <  16 : one or two (possibly overlapping) 8/4/2/1-byte stores.
//   n == 16 : a single 16-byte store pair.
//   n >  16 : align ptr to 16 bytes with one unaligned store pair, then
//             - n <  64 : 16-byte store pairs (tail63),
//             - n < 128 : unrolled 64-byte store loop (no_zva),
//             - n >= 128: DC ZVA on whole blocks when the CPU permits it and
//                         the block size is a multiple of 64 bytes; otherwise
//                         the 64-byte store loop.
//
// Register use:
//   R0  write cursor (starts at ptr)
//   R1  bytes still to clear (biased/negative inside the loops, see below)
//   R3  scratch: tail bits, DCZID_EL0 value, length after block alignment
//   R4  bytes from R0 up to the next alignment boundary
//   R5  DC ZVA block size in bytes
//   R6  R5 - 1 (block alignment mask)
//   R7  ptr + n, used only when n < 16
//   R9  scratch
TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
	MOVD	ptr+0(FP), R0
	MOVD	n+8(FP), R1

	CMP	$16, R1
	// If n is equal to 16 bytes, use zero_exact_16 to zero
	BEQ	zero_exact_16

	// If n is greater than 16 bytes, use zero_by_16 to zero
	BHI	zero_by_16

	// n is less than 16 bytes.
	// R7 = one past the last byte. Each size class below stores once from
	// the start and once ending at R7; the two stores may overlap.
	ADD	R1, R0, R7
	TBZ	$3, R1, less_than_8
	// 8 <= n <= 15
	MOVD	ZR, (R0)
	MOVD	ZR, -8(R7)
	RET

less_than_8:
	TBZ	$2, R1, less_than_4
	// 4 <= n <= 7
	MOVW	ZR, (R0)
	MOVW	ZR, -4(R7)
	RET

less_than_4:
	CBZ	R1, ending		// n == 0: nothing to do
	MOVB	ZR, (R0)		// n >= 1: clear the first byte
	TBZ	$1, R1, ending		// n == 1: done
	MOVH	ZR, -2(R7)		// n == 2 or 3: clear the last two bytes

ending:
	RET

zero_exact_16:
	// n is exactly 16 bytes
	STP	(ZR, ZR), (R0)
	RET

zero_by_16:
	// n greater than 16 bytes, check if the start address is aligned.
	// R4 = (-ptr) & 15 = bytes up to the next 16-byte boundary.
	NEG	R0, R4
	ANDS	$15, R4, R4
	// Try zeroing using zva if the start address is aligned with 16
	BEQ	try_zva

	// Non-aligned store: clear the first 16 bytes (n > 16, so this stays
	// inside the buffer) ...
	STP	(ZR, ZR), (R0)
	// ... then make the destination aligned by skipping the R4 bytes that
	// precede the boundary. The bytes between the boundary and ptr+16 get
	// cleared a second time by the code that follows.
	SUB	R4, R1, R1
	ADD	R4, R0, R0
	B	try_zva

tail_maybe_long:
	// Remainder after the ZVA loop. It is smaller than one block, which can
	// still be 64 bytes or more when the block is larger than 64 bytes.
	CMP	$64, R1
	BHS	no_zva

tail63:
	// Only bits 5:0 of R1 are examined from here on, so callers may arrive
	// with garbage (or a negative bias) in the upper bits.
	// R3 = number of bytes to clear with whole 16-byte stores: 0/16/32/48.
	ANDS	$48, R1, R3
	BEQ	last16
	CMPW	$32, R3
	BEQ	last48			// R3 == 32: two stores
	BLT	last32			// R3 == 16: one store
	STP.P	(ZR, ZR), 16(R0)	// R3 == 48: three stores
last48:
	STP.P	(ZR, ZR), 16(R0)
last32:
	STP.P	(ZR, ZR), 16(R0)
	// Fewer than 16 bytes are left. Every path to this point started with
	// n > 16 (see zero_by_16), so one 16-byte store that ends exactly at the
	// end of the buffer starts inside the buffer and is safe.
last16:
	ANDS	$15, R1, R1
	CBZ	R1, last_end
	ADD	R1, R0, R0
	STP	(ZR, ZR), -16(R0)
last_end:
	RET

no_zva:
	// Plain store loop, 64 bytes per iteration; requires at least 64 bytes.
	// R0 is biased by -16 so the final store of each iteration can use
	// pre-index writeback to advance the cursor by 64.
	// R1 is biased by -64 so that "R1 >= 0 after SUBS" means another full
	// 64 bytes remain.
	SUB	$16, R0, R0
	SUB	$64, R1, R1

loop_64:
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	STP	(ZR, ZR), 48(R0)
	STP.W	(ZR, ZR), 64(R0)
	SUBS	$64, R1, R1
	BGE	loop_64
	// R1 is now negative; its low six bits are the bytes still to clear.
	ANDS	$63, R1, ZR
	ADD	$16, R0, R0		// undo the cursor bias (flags unaffected)
	BNE	tail63
	RET

try_zva:
	// Try using the ZVA feature to zero entire cache lines
	// It is not meaningful to use ZVA if the block size is less than 64,
	// so make sure that n is greater than or equal to 64
	CMP	$63, R1
	BLE	tail63

	CMP	$128, R1
	// Ensure n is at least 128 bytes, so that there is enough left to zero
	// with ZVA after aligning to a block boundary.
	BLT	no_zva
	// Check if ZVA is allowed from user code, and if so get the block size.
	// block_size<> caches the answer:
	//   0          not probed yet
	//   bit 31 set ZVA must not be used
	//   otherwise  usable block size in bytes (a multiple of 64)
	// NOTE(review): the cache is read and written with plain 32-bit
	// accesses; this presumes every CPU reports the same DCZID_EL0 — confirm.
	MOVW	block_size<>(SB), R5
	TBNZ	$31, R5, no_zva
	CBNZ	R5, zero_by_line
	// DCZID_EL0 bit assignments
	// [63:5] Reserved
	// [4]    DZP, if bit set DC ZVA instruction is prohibited, else permitted
	// [3:0]  log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
	MRS	DCZID_EL0, R3
	TBZ	$4, R3, init

zva_unusable:
	// ZVA is prohibited, or its block size is too small to be worthwhile.
	// Remember that, so later calls branch straight to no_zva.
	MOVW	$~0, R5
	MOVW	R5, block_size<>(SB)
	B	no_zva

init:
	// R5 = 4 << DCZID_EL0[3:0] = block size in bytes.
	MOVW	$4, R9
	ANDW	$15, R3, R5
	LSLW	R5, R9, R5

	// Block size is less than 64 (4, 8, 16 or 32 bytes): do not use ZVA.
	// The check is made before caching so that a too-small size is recorded
	// as "unusable"; caching the raw size would let later calls skip this
	// check and run the ZVA loop with the tiny block size.
	ANDS	$63, R5, R9
	BNE	zva_unusable

	MOVW	R5, block_size<>(SB)

zero_by_line:
	CMP	R5, R1
	// Not enough memory to reach alignment
	BLO	no_zva
	// R6 = block mask, R4 = bytes from R0 up to the next block boundary.
	SUB	$1, R5, R6
	NEG	R0, R4
	ANDS	R6, R4, R4
	// Already aligned
	BEQ	aligned

	// Check there is enough left to zero after alignment:
	// R3 = bytes remaining once R0 reaches the block boundary.
	SUB	R4, R1, R3

	// Use ZVA only if R3 >= 64 and R3 >= block size. The first condition
	// covers the prolog below, which may store up to 64 bytes past the
	// boundary; the second guarantees at least one whole block.
	// If R3 < 64, CCMP forces NZCV=0b1010 (N != V), so BLT is taken.
	CMP	$64, R3
	CCMP	GE, R3, R5, $10  // condition code GE, NZCV=0b1010
	BLT	no_zva

	// We now have at least 64 bytes to zero, update n
	MOVD	R3, R1

loop_zva_prolog:
	// Clear 64 bytes per iteration until the block boundary has been
	// passed; R4 ends up negative, holding minus the overshoot.
	STP	(ZR, ZR), (R0)
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	SUBS	$64, R4, R4
	STP	(ZR, ZR), 48(R0)
	ADD	$64, R0, R0
	BGE	loop_zva_prolog

	// Step the cursor back onto the block boundary.
	ADD	R4, R0, R0

aligned:
	// Bias R1 by one block so that "no borrow from SUBS" means another
	// whole block remains.
	SUB	R5, R1, R1

loop_zva:
	WORD	$0xd50b7420 // DC ZVA, R0
	ADD	R5, R0, R0
	SUBS	R5, R1, R1
	BHS	loop_zva
	// R1 is now negative; R1 & (block size - 1) is the tail length.
	ANDS	R6, R1, R1
	BNE	tail_maybe_long
	RET
|
|
|
|
// block_size<> is file-local state written and read by
// memclrNoHeapPointers with 32-bit accesses. It caches the outcome of
// probing DCZID_EL0:
//   0          : not probed yet
//   bit 31 set : DC ZVA must not be used (stored as ~0)
//   otherwise  : DC ZVA block size in bytes
GLOBL block_size<>(SB), NOPTR, $8
|