1
0
mirror of https://github.com/golang/go synced 2024-11-21 11:54:39 -07:00

runtime,cmd/internal/obj/arm64,cmd/internal/objabi,cmd/link/internal/arm64: enable conditional branches to targets outside of the asm file

Results of running runtime hash benchmarks:
goos: linux
goarch: arm64
pkg: runtime
                       │ master_noaes_runtime_hash.txt │ arm64-condbr19_noaes_runtime_hash.txt │
                       │            sec/op             │     sec/op      vs base               │
Hash5-4                                    9.628n ± 0%      8.858n ± 0%  -8.00% (p=0.000 n=25)
Hash16-4                                   9.627n ± 0%      9.627n ± 0%       ~ (p=0.117 n=25)
Hash64-4                                   15.03n ± 0%      14.25n ± 0%  -5.19% (p=0.000 n=25)
Hash1024-4                                 90.19n ± 0%      88.19n ± 0%  -2.22% (p=0.000 n=25)
Hash65536-4                                5.058µ ± 0%      5.192µ ± 0%  +2.65% (p=0.000 n=25)
HashStringSpeed-4                          19.00n ± 3%      18.46n ± 4%       ~ (p=0.096 n=25)
HashBytesSpeed-4                           65.26n ± 0%      64.36n ± 0%  -1.38% (p=0.000 n=25)
HashInt32Speed-4                           12.46n ± 0%      12.50n ± 1%       ~ (p=0.829 n=25)
HashInt64Speed-4                           13.01n ± 3%      12.47n ± 3%  -4.15% (p=0.000 n=25)
HashStringArraySpeed-4                     43.45n ± 2%      41.55n ± 3%  -4.37% (p=0.000 n=25)
FastrandHashiter-4                         11.72n ± 1%      11.81n ± 1%       ~ (p=0.079 n=25)
geomean                                    33.77n           33.00n       -2.26%

            │ master_noaes_runtime_hash.txt │ arm64-condbr19_noaes_runtime_hash.txt │
            │              B/s              │      B/s        vs base               │
Hash5-4                        495.3Mi ± 0%     538.3Mi ± 0%  +8.69% (p=0.000 n=25)
Hash16-4                       1.548Gi ± 0%     1.548Gi ± 0%  +0.00% (p=0.025 n=25)
Hash64-4                       3.966Gi ± 0%     4.183Gi ± 0%  +5.47% (p=0.000 n=25)
Hash1024-4                     10.57Gi ± 0%     10.81Gi ± 0%  +2.27% (p=0.000 n=25)
Hash65536-4                    12.07Gi ± 0%     11.76Gi ± 0%  -2.57% (p=0.000 n=25)
geomean                        3.279Gi          3.367Gi       +2.70%

These results were obtained with cpu.ARM64.HasAES == false

Change-Id: If282267b9b2dcf474516cb33bfdbda4ee35bb8fa
This commit is contained in:
ArsenySamoylov 2024-08-12 13:30:20 +03:00
parent e705a2d16e
commit c3302d38d0
7 changed files with 119 additions and 78 deletions

View File

@ -2691,6 +2691,11 @@ func cmp(a int, b int) bool {
return true
}
case C_SBRA:
if b == C_ADDR {
return true
}
case C_LBRA:
if b == C_SBRA {
return true
@ -4260,7 +4265,18 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 = c.opirr(p, p.As)
o1 |= uint32(p.From.Reg & 31)
o1 |= uint32(c.brdist(p, 0, 19, 2) << 5)
if p.To.Sym == nil {
o1 |= uint32(c.brdist(p, 0, 19, 2) << 5)
break
}
rel := obj.Addrel(c.cursym)
rel.Off = int32(c.pc)
rel.Siz = 4
rel.Sym = p.To.Sym
rel.Add = p.To.Offset
rel.Type = objabi.R_AARCH64_CONDBR19
case 40: /* tbz */
o1 = c.opirr(p, p.As)

View File

@ -38,3 +38,17 @@ func TestMOVK(t *testing.T) {
t.Errorf("Got %x want %x\n", x, want)
}
}
// testcondbr19 is implemented in assembly. It issues a conditional branch
// (CBZ) to testcondbr19_target; TestCondBr19 expects it to return 42.
func testcondbr19() uint64
// testcondbr19_target is the destination of the conditional branch issued by
// the assembly routine testcondbr19. It returns the fixed value 42 that
// TestCondBr19 compares against.
func testcondbr19_target() uint64 {
	const result uint64 = 42
	return result
}
// TestCondBr19 checks that a conditional branch whose target is a symbol
// defined outside of the *arm64.s file works: testcondbr19 must come back
// with the value produced by testcondbr19_target.
func TestCondBr19(t *testing.T) {
	const want uint64 = 42
	if x := testcondbr19(); x != want {
		t.Errorf("Got %d want %d\n", x, want)
	}
}

View File

@ -37,3 +37,8 @@ TEXT ·testmovk(SB), NOSPLIT, $0-8
MOVK $(40000<<48), R0
MOVD R0, ret+0(FP)
RET
// testcondbr19() uint64
// Exercises a conditional branch (CBZ) whose target is the symbol
// ·testcondbr19_target rather than a local label in this function.
// NOTE(review): no RET follows the CBZ in the visible code, so the routine
// relies on the branch always being taken — confirm this is intended.
TEXT ·testcondbr19(SB), NOSPLIT, $0-8
MOVD $0, R0 // R0 = 0, so the CBZ below is taken
CBZ R0, ·testcondbr19_target(SB) // branch to the symbol target when R0 is zero

View File

@ -131,6 +131,7 @@ const (
// inherently processor specific.
// Arm64.
R_AARCH64_CONDBR19
// Set a MOV[NZ] immediate field to bits [15:0] of the offset from the thread
// local base to the thread local variable defined by the referenced (thread

View File

@ -40,74 +40,75 @@ func _() {
_ = x[R_JMPMIPS-30]
_ = x[R_DWARFSECREF-31]
_ = x[R_DWARFFILEREF-32]
_ = x[R_ARM64_TLS_LE-33]
_ = x[R_ARM64_TLS_IE-34]
_ = x[R_ARM64_GOTPCREL-35]
_ = x[R_ARM64_GOT-36]
_ = x[R_ARM64_PCREL-37]
_ = x[R_ARM64_PCREL_LDST8-38]
_ = x[R_ARM64_PCREL_LDST16-39]
_ = x[R_ARM64_PCREL_LDST32-40]
_ = x[R_ARM64_PCREL_LDST64-41]
_ = x[R_ARM64_LDST8-42]
_ = x[R_ARM64_LDST16-43]
_ = x[R_ARM64_LDST32-44]
_ = x[R_ARM64_LDST64-45]
_ = x[R_ARM64_LDST128-46]
_ = x[R_POWER_TLS_LE-47]
_ = x[R_POWER_TLS_IE-48]
_ = x[R_POWER_TLS-49]
_ = x[R_POWER_TLS_IE_PCREL34-50]
_ = x[R_POWER_TLS_LE_TPREL34-51]
_ = x[R_ADDRPOWER_DS-52]
_ = x[R_ADDRPOWER_GOT-53]
_ = x[R_ADDRPOWER_GOT_PCREL34-54]
_ = x[R_ADDRPOWER_PCREL-55]
_ = x[R_ADDRPOWER_TOCREL-56]
_ = x[R_ADDRPOWER_TOCREL_DS-57]
_ = x[R_ADDRPOWER_D34-58]
_ = x[R_ADDRPOWER_PCREL34-59]
_ = x[R_RISCV_JAL-60]
_ = x[R_RISCV_JAL_TRAMP-61]
_ = x[R_RISCV_CALL-62]
_ = x[R_RISCV_PCREL_ITYPE-63]
_ = x[R_RISCV_PCREL_STYPE-64]
_ = x[R_RISCV_TLS_IE-65]
_ = x[R_RISCV_TLS_LE-66]
_ = x[R_RISCV_GOT_HI20-67]
_ = x[R_RISCV_PCREL_HI20-68]
_ = x[R_RISCV_PCREL_LO12_I-69]
_ = x[R_RISCV_PCREL_LO12_S-70]
_ = x[R_RISCV_BRANCH-71]
_ = x[R_RISCV_RVC_BRANCH-72]
_ = x[R_RISCV_RVC_JUMP-73]
_ = x[R_PCRELDBL-74]
_ = x[R_LOONG64_ADDR_HI-75]
_ = x[R_LOONG64_ADDR_LO-76]
_ = x[R_LOONG64_TLS_LE_HI-77]
_ = x[R_LOONG64_TLS_LE_LO-78]
_ = x[R_CALLLOONG64-79]
_ = x[R_LOONG64_TLS_IE_HI-80]
_ = x[R_LOONG64_TLS_IE_LO-81]
_ = x[R_LOONG64_GOT_HI-82]
_ = x[R_LOONG64_GOT_LO-83]
_ = x[R_LOONG64_ADD64-84]
_ = x[R_LOONG64_SUB64-85]
_ = x[R_JMP16LOONG64-86]
_ = x[R_JMP21LOONG64-87]
_ = x[R_JMPLOONG64-88]
_ = x[R_ADDRMIPSU-89]
_ = x[R_ADDRMIPSTLS-90]
_ = x[R_ADDRCUOFF-91]
_ = x[R_WASMIMPORT-92]
_ = x[R_XCOFFREF-93]
_ = x[R_PEIMAGEOFF-94]
_ = x[R_INITORDER-95]
_ = x[R_AARCH64_CONDBR19-33]
_ = x[R_ARM64_TLS_LE-34]
_ = x[R_ARM64_TLS_IE-35]
_ = x[R_ARM64_GOTPCREL-36]
_ = x[R_ARM64_GOT-37]
_ = x[R_ARM64_PCREL-38]
_ = x[R_ARM64_PCREL_LDST8-39]
_ = x[R_ARM64_PCREL_LDST16-40]
_ = x[R_ARM64_PCREL_LDST32-41]
_ = x[R_ARM64_PCREL_LDST64-42]
_ = x[R_ARM64_LDST8-43]
_ = x[R_ARM64_LDST16-44]
_ = x[R_ARM64_LDST32-45]
_ = x[R_ARM64_LDST64-46]
_ = x[R_ARM64_LDST128-47]
_ = x[R_POWER_TLS_LE-48]
_ = x[R_POWER_TLS_IE-49]
_ = x[R_POWER_TLS-50]
_ = x[R_POWER_TLS_IE_PCREL34-51]
_ = x[R_POWER_TLS_LE_TPREL34-52]
_ = x[R_ADDRPOWER_DS-53]
_ = x[R_ADDRPOWER_GOT-54]
_ = x[R_ADDRPOWER_GOT_PCREL34-55]
_ = x[R_ADDRPOWER_PCREL-56]
_ = x[R_ADDRPOWER_TOCREL-57]
_ = x[R_ADDRPOWER_TOCREL_DS-58]
_ = x[R_ADDRPOWER_D34-59]
_ = x[R_ADDRPOWER_PCREL34-60]
_ = x[R_RISCV_JAL-61]
_ = x[R_RISCV_JAL_TRAMP-62]
_ = x[R_RISCV_CALL-63]
_ = x[R_RISCV_PCREL_ITYPE-64]
_ = x[R_RISCV_PCREL_STYPE-65]
_ = x[R_RISCV_TLS_IE-66]
_ = x[R_RISCV_TLS_LE-67]
_ = x[R_RISCV_GOT_HI20-68]
_ = x[R_RISCV_PCREL_HI20-69]
_ = x[R_RISCV_PCREL_LO12_I-70]
_ = x[R_RISCV_PCREL_LO12_S-71]
_ = x[R_RISCV_BRANCH-72]
_ = x[R_RISCV_RVC_BRANCH-73]
_ = x[R_RISCV_RVC_JUMP-74]
_ = x[R_PCRELDBL-75]
_ = x[R_LOONG64_ADDR_HI-76]
_ = x[R_LOONG64_ADDR_LO-77]
_ = x[R_LOONG64_TLS_LE_HI-78]
_ = x[R_LOONG64_TLS_LE_LO-79]
_ = x[R_CALLLOONG64-80]
_ = x[R_LOONG64_TLS_IE_HI-81]
_ = x[R_LOONG64_TLS_IE_LO-82]
_ = x[R_LOONG64_GOT_HI-83]
_ = x[R_LOONG64_GOT_LO-84]
_ = x[R_LOONG64_ADD64-85]
_ = x[R_LOONG64_SUB64-86]
_ = x[R_JMP16LOONG64-87]
_ = x[R_JMP21LOONG64-88]
_ = x[R_JMPLOONG64-89]
_ = x[R_ADDRMIPSU-90]
_ = x[R_ADDRMIPSTLS-91]
_ = x[R_ADDRCUOFF-92]
_ = x[R_WASMIMPORT-93]
_ = x[R_XCOFFREF-94]
_ = x[R_PEIMAGEOFF-95]
_ = x[R_INITORDER-96]
}
const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USENAMEDMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_JALR_RISCV_JAL_TRAMPR_RISCV_CALLR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IER_RISCV_TLS_LER_RISCV_GOT_HI20R_RISCV_PCREL_HI20R_RISCV_PCREL_LO12_IR_RISCV_PCREL_LO12_SR_RISCV_BRANCHR_RISCV_RVC_BRANCHR_RISCV_RVC_JUMPR_PCRELDBLR_LOONG64_ADDR_HIR_LOONG64_ADDR_LOR_LOONG64_TLS_LE_HIR_LOONG64_TLS_LE_LOR_CALLLOONG64R_LOONG64_TLS_IE_HIR_LOONG64_TLS_IE_LOR_LOONG64_GOT_HIR_LOONG64_GOT_LOR_LOONG64_ADD64R_LOONG64_SUB64R_JMP16LOONG64R_JMP21LOONG64R_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER"
const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USENAMEDMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_AARCH64_CONDBR19R_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_JALR_RISCV_JAL_TRAMPR_RISCV_CALLR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IER_RISCV_TLS_LER_RISCV_GOT_HI20R_RISCV_PCREL_HI20R_RISCV_PCREL_LO12_IR_RISCV_PCREL_LO12_SR_RISCV_BRANCHR_RISCV_RVC_BRANCHR_RISCV_RVC_JUMPR_PCRELDBLR_LOONG64_ADDR_HIR_LOONG64_ADDR_LOR_LOONG64_TLS_LE_HIR_LOONG64_TLS_LE_LOR_CALLLOONG64R_LOONG64_TLS_IE_HIR_LOONG64_TLS_IE_LOR_LOONG64_GOT_HIR_LOONG64_GOT_LOR_LOONG64_ADD64R_LOONG64_SUB64R_JMP16LOONG64R_JMP21LOONG64R_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER"
var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 226, 237, 243, 254, 264, 273, 286, 300, 314, 328, 344, 355, 368, 387, 407, 427, 447, 460, 474, 488, 502, 517, 531, 545, 556, 578, 600, 614, 629, 652, 669, 687, 708, 723, 742, 753, 770, 782, 801, 820, 834, 848, 864, 882, 902, 922, 936, 954, 970, 980, 997, 1014, 1033, 1052, 1065, 1084, 1103, 1119, 1135, 1150, 1165, 1179, 1193, 1205, 1216, 1229, 1240, 1252, 1262, 1274, 1285}
var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 226, 237, 243, 254, 264, 273, 286, 300, 318, 332, 346, 362, 373, 386, 405, 425, 445, 465, 478, 492, 506, 520, 535, 549, 563, 574, 596, 618, 632, 647, 670, 687, 705, 726, 741, 760, 771, 788, 800, 819, 838, 852, 866, 882, 900, 920, 940, 954, 972, 988, 998, 1015, 1032, 1051, 1070, 1083, 1102, 1121, 1137, 1153, 1168, 1183, 1197, 1211, 1223, 1234, 1247, 1258, 1270, 1280, 1292, 1303}
func (i RelocType) String() string {
i -= 1

View File

@ -968,6 +968,18 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade
}
return val | ((t >> 2) & 0x03ffffff), noExtReloc, true
case objabi.R_AARCH64_CONDBR19:
var t int64
if ldr.SymType(rs) == sym.SDYNIMPORT {
t = (ldr.SymAddr(syms.PLT) + r.Add()) - (ldr.SymValue(s) + int64(r.Off()))
} else {
t = (ldr.SymAddr(rs) + r.Add()) - (ldr.SymValue(s) + int64(r.Off()))
}
if t >= 1<<20 || t < -1<<20 {
ldr.Errorf(s, "program too large, call relocation distance = %d", t)
}
return val | (((t >> 2) & 0x7ffff) << 5), noExtReloc, true
case objabi.R_ARM64_GOT:
if (val>>24)&0x9f == 0x90 {
// R_AARCH64_ADR_GOT_PAGE

View File

@ -532,7 +532,7 @@ CALLFN(·call1073741824, 1073741824)
// func memhash32(p unsafe.Pointer, h uintptr) uintptr
TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
MOVB runtime·useAeshash(SB), R10
CBZ R10, noaes
CBZ R10, runtime·memhash32Fallback<ABIInternal>(SB)
MOVD $runtime·aeskeysched+0(SB), R3
VEOR V0.B16, V0.B16, V0.B16
@ -548,13 +548,11 @@ TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
VMOV V0.D[0], R0
RET
noaes:
B runtime·memhash32Fallback<ABIInternal>(SB)
// func memhash64(p unsafe.Pointer, h uintptr) uintptr
TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
MOVB runtime·useAeshash(SB), R10
CBZ R10, noaes
CBZ R10, runtime·memhash64Fallback<ABIInternal>(SB)
MOVD $runtime·aeskeysched+0(SB), R3
VEOR V0.B16, V0.B16, V0.B16
@ -570,25 +568,19 @@ TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
VMOV V0.D[0], R0
RET
noaes:
B runtime·memhash64Fallback<ABIInternal>(SB)
// func memhash(p unsafe.Pointer, h, size uintptr) uintptr
TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
MOVB runtime·useAeshash(SB), R10
CBZ R10, noaes
CBZ R10, runtime·memhashFallback<ABIInternal>(SB)
B aeshashbody<>(SB)
noaes:
B runtime·memhashFallback<ABIInternal>(SB)
// func strhash(p unsafe.Pointer, h uintptr) uintptr
TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-24
MOVB runtime·useAeshash(SB), R10
CBZ R10, noaes
CBZ R10, runtime·strhashFallback<ABIInternal>(SB)
LDP (R0), (R0, R2) // string data / length
B aeshashbody<>(SB)
noaes:
B runtime·strhashFallback<ABIInternal>(SB)
// R0: data
// R1: seed data