From c3302d38d0c27491a1ea1e77bb4b424586507750 Mon Sep 17 00:00:00 2001 From: ArsenySamoylov Date: Mon, 12 Aug 2024 13:30:20 +0300 Subject: [PATCH] runtime,cmd/internal/obj/arm64,cmd/internal/objabi,cmd/link/internal/arm64: enable conditional branches to targets outside the asm file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the R_AARCH64_CONDBR19 relocation so that a compare-and-branch instruction such as CBZ can branch to a symbol rather than only to a local label: the assembler emits the relocation when the branch target is a symbol, and the linker resolves the 19-bit offset. The runtime hash entry points use this to branch directly to their fallbacks instead of going through a local noaes label. Results of running runtime hash benchmarks: goos: linux goarch: arm64 pkg: runtime │ master_noaes_runtime_hash.txt │ arm64-condbr19_noaes_runtime_hash.txt │ │ sec/op │ sec/op vs base │ Hash5-4 9.628n ± 0% 8.858n ± 0% -8.00% (p=0.000 n=25) Hash16-4 9.627n ± 0% 9.627n ± 0% ~ (p=0.117 n=25) Hash64-4 15.03n ± 0% 14.25n ± 0% -5.19% (p=0.000 n=25) Hash1024-4 90.19n ± 0% 88.19n ± 0% -2.22% (p=0.000 n=25) Hash65536-4 5.058µ ± 0% 5.192µ ± 0% +2.65% (p=0.000 n=25) HashStringSpeed-4 19.00n ± 3% 18.46n ± 4% ~ (p=0.096 n=25) HashBytesSpeed-4 65.26n ± 0% 64.36n ± 0% -1.38% (p=0.000 n=25) HashInt32Speed-4 12.46n ± 0% 12.50n ± 1% ~ (p=0.829 n=25) HashInt64Speed-4 13.01n ± 3% 12.47n ± 3% -4.15% (p=0.000 n=25) HashStringArraySpeed-4 43.45n ± 2% 41.55n ± 3% -4.37% (p=0.000 n=25) FastrandHashiter-4 11.72n ± 1% 11.81n ± 1% ~ (p=0.079 n=25) geomean 33.77n 33.00n -2.26% │ master_noaes_runtime_hash.txt │ arm64-condbr19_noaes_runtime_hash.txt │ │ B/s │ B/s vs base │ Hash5-4 495.3Mi ± 0% 538.3Mi ± 0% +8.69% (p=0.000 n=25) Hash16-4 1.548Gi ± 0% 1.548Gi ± 0% +0.00% (p=0.025 n=25) Hash64-4 3.966Gi ± 0% 4.183Gi ± 0% +5.47% (p=0.000 n=25) Hash1024-4 10.57Gi ± 0% 10.81Gi ± 0% +2.27% (p=0.000 n=25) Hash65536-4 12.07Gi ± 0% 11.76Gi ± 0% -2.57% (p=0.000 n=25) geomean 3.279Gi 3.367Gi +2.70% These results were obtained with cpu.ARM64.HasAES == false Change-Id: If282267b9b2dcf474516cb33bfdbda4ee35bb8fa --- src/cmd/internal/obj/arm64/asm7.go | 18 ++- src/cmd/internal/obj/arm64/asm_arm64_test.go | 14 ++ src/cmd/internal/obj/arm64/asm_arm64_test.s | 5 + src/cmd/internal/objabi/reloctype.go | 1 + src/cmd/internal/objabi/reloctype_string.go | 
131 ++++++++++--------- src/cmd/link/internal/arm64/asm.go | 12 ++ src/runtime/asm_arm64.s | 16 +-- 7 files changed, 119 insertions(+), 78 deletions(-) diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 178c83c0bc6..2c905458d69 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -2691,6 +2691,11 @@ func cmp(a int, b int) bool { return true } + case C_SBRA: + if b == C_ADDR { + return true + } + case C_LBRA: if b == C_SBRA { return true @@ -4260,7 +4265,18 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = c.opirr(p, p.As) o1 |= uint32(p.From.Reg & 31) - o1 |= uint32(c.brdist(p, 0, 19, 2) << 5) + + if p.To.Sym == nil { + o1 |= uint32(c.brdist(p, 0, 19, 2) << 5) + break + } + + rel := obj.Addrel(c.cursym) + rel.Off = int32(c.pc) + rel.Siz = 4 + rel.Sym = p.To.Sym + rel.Add = p.To.Offset + rel.Type = objabi.R_AARCH64_CONDBR19 case 40: /* tbz */ o1 = c.opirr(p, p.As) diff --git a/src/cmd/internal/obj/arm64/asm_arm64_test.go b/src/cmd/internal/obj/arm64/asm_arm64_test.go index 83d137a0846..7e743598980 100644 --- a/src/cmd/internal/obj/arm64/asm_arm64_test.go +++ b/src/cmd/internal/obj/arm64/asm_arm64_test.go @@ -38,3 +38,17 @@ func TestMOVK(t *testing.T) { t.Errorf("Got %x want %x\n", x, want) } } + +func testcondbr19() uint64 +func testcondbr19_target() uint64 { + return 42 +} + +// TestCondBr19 makes sure that conditional branch on labels outside of *arm64.s file works fine +func TestCondBr19(t *testing.T) { + x := testcondbr19() + want := uint64(42) + if x != want { + t.Errorf("Got %d want %d\n", x, want) + } +} diff --git a/src/cmd/internal/obj/arm64/asm_arm64_test.s b/src/cmd/internal/obj/arm64/asm_arm64_test.s index e3fda57775f..12cce7e3b1d 100644 --- a/src/cmd/internal/obj/arm64/asm_arm64_test.s +++ b/src/cmd/internal/obj/arm64/asm_arm64_test.s @@ -37,3 +37,8 @@ TEXT ·testmovk(SB), NOSPLIT, $0-8 MOVK $(40000<<48), R0 MOVD R0, ret+0(FP) RET + +// testcondbr19() 
uint64 +TEXT ·testcondbr19(SB), NOSPLIT, $0-8 + MOVD $0, R0 + CBZ R0, ·testcondbr19_target(SB) diff --git a/src/cmd/internal/objabi/reloctype.go b/src/cmd/internal/objabi/reloctype.go index b7f0124c2a6..1b84fab3b69 100644 --- a/src/cmd/internal/objabi/reloctype.go +++ b/src/cmd/internal/objabi/reloctype.go @@ -131,6 +131,7 @@ const ( // inherently processor specific. // Arm64. + R_AARCH64_CONDBR19 // Set a MOV[NZ] immediate field to bits [15:0] of the offset from the thread // local base to the thread local variable defined by the referenced (thread diff --git a/src/cmd/internal/objabi/reloctype_string.go b/src/cmd/internal/objabi/reloctype_string.go index fd0e401db1e..7f73b12a808 100644 --- a/src/cmd/internal/objabi/reloctype_string.go +++ b/src/cmd/internal/objabi/reloctype_string.go @@ -40,74 +40,75 @@ func _() { _ = x[R_JMPMIPS-30] _ = x[R_DWARFSECREF-31] _ = x[R_DWARFFILEREF-32] - _ = x[R_ARM64_TLS_LE-33] - _ = x[R_ARM64_TLS_IE-34] - _ = x[R_ARM64_GOTPCREL-35] - _ = x[R_ARM64_GOT-36] - _ = x[R_ARM64_PCREL-37] - _ = x[R_ARM64_PCREL_LDST8-38] - _ = x[R_ARM64_PCREL_LDST16-39] - _ = x[R_ARM64_PCREL_LDST32-40] - _ = x[R_ARM64_PCREL_LDST64-41] - _ = x[R_ARM64_LDST8-42] - _ = x[R_ARM64_LDST16-43] - _ = x[R_ARM64_LDST32-44] - _ = x[R_ARM64_LDST64-45] - _ = x[R_ARM64_LDST128-46] - _ = x[R_POWER_TLS_LE-47] - _ = x[R_POWER_TLS_IE-48] - _ = x[R_POWER_TLS-49] - _ = x[R_POWER_TLS_IE_PCREL34-50] - _ = x[R_POWER_TLS_LE_TPREL34-51] - _ = x[R_ADDRPOWER_DS-52] - _ = x[R_ADDRPOWER_GOT-53] - _ = x[R_ADDRPOWER_GOT_PCREL34-54] - _ = x[R_ADDRPOWER_PCREL-55] - _ = x[R_ADDRPOWER_TOCREL-56] - _ = x[R_ADDRPOWER_TOCREL_DS-57] - _ = x[R_ADDRPOWER_D34-58] - _ = x[R_ADDRPOWER_PCREL34-59] - _ = x[R_RISCV_JAL-60] - _ = x[R_RISCV_JAL_TRAMP-61] - _ = x[R_RISCV_CALL-62] - _ = x[R_RISCV_PCREL_ITYPE-63] - _ = x[R_RISCV_PCREL_STYPE-64] - _ = x[R_RISCV_TLS_IE-65] - _ = x[R_RISCV_TLS_LE-66] - _ = x[R_RISCV_GOT_HI20-67] - _ = x[R_RISCV_PCREL_HI20-68] - _ = x[R_RISCV_PCREL_LO12_I-69] - _ = 
x[R_RISCV_PCREL_LO12_S-70] - _ = x[R_RISCV_BRANCH-71] - _ = x[R_RISCV_RVC_BRANCH-72] - _ = x[R_RISCV_RVC_JUMP-73] - _ = x[R_PCRELDBL-74] - _ = x[R_LOONG64_ADDR_HI-75] - _ = x[R_LOONG64_ADDR_LO-76] - _ = x[R_LOONG64_TLS_LE_HI-77] - _ = x[R_LOONG64_TLS_LE_LO-78] - _ = x[R_CALLLOONG64-79] - _ = x[R_LOONG64_TLS_IE_HI-80] - _ = x[R_LOONG64_TLS_IE_LO-81] - _ = x[R_LOONG64_GOT_HI-82] - _ = x[R_LOONG64_GOT_LO-83] - _ = x[R_LOONG64_ADD64-84] - _ = x[R_LOONG64_SUB64-85] - _ = x[R_JMP16LOONG64-86] - _ = x[R_JMP21LOONG64-87] - _ = x[R_JMPLOONG64-88] - _ = x[R_ADDRMIPSU-89] - _ = x[R_ADDRMIPSTLS-90] - _ = x[R_ADDRCUOFF-91] - _ = x[R_WASMIMPORT-92] - _ = x[R_XCOFFREF-93] - _ = x[R_PEIMAGEOFF-94] - _ = x[R_INITORDER-95] + _ = x[R_AARCH64_CONDBR19-33] + _ = x[R_ARM64_TLS_LE-34] + _ = x[R_ARM64_TLS_IE-35] + _ = x[R_ARM64_GOTPCREL-36] + _ = x[R_ARM64_GOT-37] + _ = x[R_ARM64_PCREL-38] + _ = x[R_ARM64_PCREL_LDST8-39] + _ = x[R_ARM64_PCREL_LDST16-40] + _ = x[R_ARM64_PCREL_LDST32-41] + _ = x[R_ARM64_PCREL_LDST64-42] + _ = x[R_ARM64_LDST8-43] + _ = x[R_ARM64_LDST16-44] + _ = x[R_ARM64_LDST32-45] + _ = x[R_ARM64_LDST64-46] + _ = x[R_ARM64_LDST128-47] + _ = x[R_POWER_TLS_LE-48] + _ = x[R_POWER_TLS_IE-49] + _ = x[R_POWER_TLS-50] + _ = x[R_POWER_TLS_IE_PCREL34-51] + _ = x[R_POWER_TLS_LE_TPREL34-52] + _ = x[R_ADDRPOWER_DS-53] + _ = x[R_ADDRPOWER_GOT-54] + _ = x[R_ADDRPOWER_GOT_PCREL34-55] + _ = x[R_ADDRPOWER_PCREL-56] + _ = x[R_ADDRPOWER_TOCREL-57] + _ = x[R_ADDRPOWER_TOCREL_DS-58] + _ = x[R_ADDRPOWER_D34-59] + _ = x[R_ADDRPOWER_PCREL34-60] + _ = x[R_RISCV_JAL-61] + _ = x[R_RISCV_JAL_TRAMP-62] + _ = x[R_RISCV_CALL-63] + _ = x[R_RISCV_PCREL_ITYPE-64] + _ = x[R_RISCV_PCREL_STYPE-65] + _ = x[R_RISCV_TLS_IE-66] + _ = x[R_RISCV_TLS_LE-67] + _ = x[R_RISCV_GOT_HI20-68] + _ = x[R_RISCV_PCREL_HI20-69] + _ = x[R_RISCV_PCREL_LO12_I-70] + _ = x[R_RISCV_PCREL_LO12_S-71] + _ = x[R_RISCV_BRANCH-72] + _ = x[R_RISCV_RVC_BRANCH-73] + _ = x[R_RISCV_RVC_JUMP-74] + _ = x[R_PCRELDBL-75] + _ = 
x[R_LOONG64_ADDR_HI-76] + _ = x[R_LOONG64_ADDR_LO-77] + _ = x[R_LOONG64_TLS_LE_HI-78] + _ = x[R_LOONG64_TLS_LE_LO-79] + _ = x[R_CALLLOONG64-80] + _ = x[R_LOONG64_TLS_IE_HI-81] + _ = x[R_LOONG64_TLS_IE_LO-82] + _ = x[R_LOONG64_GOT_HI-83] + _ = x[R_LOONG64_GOT_LO-84] + _ = x[R_LOONG64_ADD64-85] + _ = x[R_LOONG64_SUB64-86] + _ = x[R_JMP16LOONG64-87] + _ = x[R_JMP21LOONG64-88] + _ = x[R_JMPLOONG64-89] + _ = x[R_ADDRMIPSU-90] + _ = x[R_ADDRMIPSTLS-91] + _ = x[R_ADDRCUOFF-92] + _ = x[R_WASMIMPORT-93] + _ = x[R_XCOFFREF-94] + _ = x[R_PEIMAGEOFF-95] + _ = x[R_INITORDER-96] } -const _RelocType_name = "R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USENAMEDMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_JALR_RISCV_JAL_TRAMPR_RISCV_CALLR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IER_RISCV_TLS_LER_RISCV_GOT_HI20R_RISCV_PCREL_HI20R_RISCV_PCREL_LO12_IR_RISCV_PCREL_LO12_SR_RISCV_BRANCHR_RISCV_RVC_BRANCHR_RISCV_RVC_JUMPR_PCRELDBLR_LOONG64_ADDR_HIR_LOONG64_ADDR_LOR_LOONG64_TLS_LE_HIR_LOONG64_TLS_LE_LOR_CALLLOONG64R_LOONG64_TLS_IE_HIR_LOONG64_TLS_IE_LOR_LOONG64_GOT_HIR_LOONG64_GOT_LOR_LOONG64_ADD64R_LOONG64_SUB64R_JMP16LOONG64R_JMP21LOONG64R_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER" +const _RelocType_name = 
"R_ADDRR_ADDRPOWERR_ADDRARM64R_ADDRMIPSR_ADDROFFR_SIZER_CALLR_CALLARMR_CALLARM64R_CALLINDR_CALLPOWERR_CALLMIPSR_CONSTR_PCRELR_TLS_LER_TLS_IER_GOTOFFR_PLT0R_PLT1R_PLT2R_USEFIELDR_USETYPER_USEIFACER_USEIFACEMETHODR_USENAMEDMETHODR_METHODOFFR_KEEPR_POWER_TOCR_GOTPCRELR_JMPMIPSR_DWARFSECREFR_DWARFFILEREFR_AARCH64_CONDBR19R_ARM64_TLS_LER_ARM64_TLS_IER_ARM64_GOTPCRELR_ARM64_GOTR_ARM64_PCRELR_ARM64_PCREL_LDST8R_ARM64_PCREL_LDST16R_ARM64_PCREL_LDST32R_ARM64_PCREL_LDST64R_ARM64_LDST8R_ARM64_LDST16R_ARM64_LDST32R_ARM64_LDST64R_ARM64_LDST128R_POWER_TLS_LER_POWER_TLS_IER_POWER_TLSR_POWER_TLS_IE_PCREL34R_POWER_TLS_LE_TPREL34R_ADDRPOWER_DSR_ADDRPOWER_GOTR_ADDRPOWER_GOT_PCREL34R_ADDRPOWER_PCRELR_ADDRPOWER_TOCRELR_ADDRPOWER_TOCREL_DSR_ADDRPOWER_D34R_ADDRPOWER_PCREL34R_RISCV_JALR_RISCV_JAL_TRAMPR_RISCV_CALLR_RISCV_PCREL_ITYPER_RISCV_PCREL_STYPER_RISCV_TLS_IER_RISCV_TLS_LER_RISCV_GOT_HI20R_RISCV_PCREL_HI20R_RISCV_PCREL_LO12_IR_RISCV_PCREL_LO12_SR_RISCV_BRANCHR_RISCV_RVC_BRANCHR_RISCV_RVC_JUMPR_PCRELDBLR_LOONG64_ADDR_HIR_LOONG64_ADDR_LOR_LOONG64_TLS_LE_HIR_LOONG64_TLS_LE_LOR_CALLLOONG64R_LOONG64_TLS_IE_HIR_LOONG64_TLS_IE_LOR_LOONG64_GOT_HIR_LOONG64_GOT_LOR_LOONG64_ADD64R_LOONG64_SUB64R_JMP16LOONG64R_JMP21LOONG64R_JMPLOONG64R_ADDRMIPSUR_ADDRMIPSTLSR_ADDRCUOFFR_WASMIMPORTR_XCOFFREFR_PEIMAGEOFFR_INITORDER" -var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 226, 237, 243, 254, 264, 273, 286, 300, 314, 328, 344, 355, 368, 387, 407, 427, 447, 460, 474, 488, 502, 517, 531, 545, 556, 578, 600, 614, 629, 652, 669, 687, 708, 723, 742, 753, 770, 782, 801, 820, 834, 848, 864, 882, 902, 922, 936, 954, 970, 980, 997, 1014, 1033, 1052, 1065, 1084, 1103, 1119, 1135, 1150, 1165, 1179, 1193, 1205, 1216, 1229, 1240, 1252, 1262, 1274, 1285} +var _RelocType_index = [...]uint16{0, 6, 17, 28, 38, 47, 53, 59, 68, 79, 88, 99, 109, 116, 123, 131, 139, 147, 153, 159, 165, 175, 184, 194, 210, 226, 237, 243, 254, 264, 
273, 286, 300, 318, 332, 346, 362, 373, 386, 405, 425, 445, 465, 478, 492, 506, 520, 535, 549, 563, 574, 596, 618, 632, 647, 670, 687, 705, 726, 741, 760, 771, 788, 800, 819, 838, 852, 866, 882, 900, 920, 940, 954, 972, 988, 998, 1015, 1032, 1051, 1070, 1083, 1102, 1121, 1137, 1153, 1168, 1183, 1197, 1211, 1223, 1234, 1247, 1258, 1270, 1280, 1292, 1303} func (i RelocType) String() string { i -= 1 diff --git a/src/cmd/link/internal/arm64/asm.go b/src/cmd/link/internal/arm64/asm.go index 7b85bb3e261..08224913650 100644 --- a/src/cmd/link/internal/arm64/asm.go +++ b/src/cmd/link/internal/arm64/asm.go @@ -968,6 +968,18 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade } return val | ((t >> 2) & 0x03ffffff), noExtReloc, true + case objabi.R_AARCH64_CONDBR19: + var t int64 + if ldr.SymType(rs) == sym.SDYNIMPORT { + t = (ldr.SymAddr(syms.PLT) + r.Add()) - (ldr.SymValue(s) + int64(r.Off())) + } else { + t = (ldr.SymAddr(rs) + r.Add()) - (ldr.SymValue(s) + int64(r.Off())) + } + if t >= 1<<20 || t < -1<<20 { + ldr.Errorf(s, "program too large, call relocation distance = %d", t) + } + return val | (((t >> 2) & 0x7ffff) << 5), noExtReloc, true + case objabi.R_ARM64_GOT: if (val>>24)&0x9f == 0x90 { // R_AARCH64_ADR_GOT_PAGE diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 64a18805893..c92858d4f6e 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -532,7 +532,7 @@ CALLFN(·call1073741824, 1073741824) // func memhash32(p unsafe.Pointer, h uintptr) uintptr TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24 MOVB runtime·useAeshash(SB), R10 - CBZ R10, noaes + CBZ R10, runtime·memhash32Fallback(SB) MOVD $runtime·aeskeysched+0(SB), R3 VEOR V0.B16, V0.B16, V0.B16 @@ -548,13 +548,11 @@ TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24 VMOV V0.D[0], R0 RET -noaes: - B runtime·memhash32Fallback(SB) // func memhash64(p unsafe.Pointer, h uintptr) uintptr TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24 MOVB 
runtime·useAeshash(SB), R10 - CBZ R10, noaes + CBZ R10, runtime·memhash64Fallback(SB) MOVD $runtime·aeskeysched+0(SB), R3 VEOR V0.B16, V0.B16, V0.B16 @@ -570,25 +568,19 @@ TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24 VMOV V0.D[0], R0 RET -noaes: - B runtime·memhash64Fallback(SB) // func memhash(p unsafe.Pointer, h, size uintptr) uintptr TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32 MOVB runtime·useAeshash(SB), R10 - CBZ R10, noaes + CBZ R10, runtime·memhashFallback(SB) B aeshashbody<>(SB) -noaes: - B runtime·memhashFallback(SB) // func strhash(p unsafe.Pointer, h uintptr) uintptr TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24 MOVB runtime·useAeshash(SB), R10 - CBZ R10, noaes + CBZ R10, runtime·strhashFallback(SB) LDP (R0), (R0, R2) // string data / length B aeshashbody<>(SB) -noaes: - B runtime·strhashFallback(SB) // R0: data // R1: seed data