From 0e23ca41d99c82d301badf1b762888e2c69e6c57 Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Wed, 28 Oct 2015 23:20:26 +0300 Subject: [PATCH] bytes: speed up Compare() on amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use AVX2 if available. Results (haswell), below: name old time/op new time/op delta BytesCompare1-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare2-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare4-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare8-6 9.29ns ± 2% 8.76ns ± 0% -5.72% (p=0.000 n=16+17) BytesCompare16-6 9.29ns ± 2% 9.20ns ± 0% -1.02% (p=0.000 n=20+16) BytesCompare32-6 11.4ns ± 1% 11.4ns ± 0% ~ (p=0.191 n=20+20) BytesCompare64-6 14.4ns ± 0% 13.1ns ± 0% -8.68% (p=0.000 n=20+20) BytesCompare128-6 20.2ns ± 0% 18.5ns ± 0% -8.27% (p=0.000 n=16+20) BytesCompare256-6 29.3ns ± 0% 24.5ns ± 0% -16.38% (p=0.000 n=16+16) BytesCompare512-6 46.8ns ± 0% 37.1ns ± 0% -20.78% (p=0.000 n=18+16) BytesCompare1024-6 82.9ns ± 0% 62.3ns ± 0% -24.86% (p=0.000 n=20+14) BytesCompare2048-6 155ns ± 0% 112ns ± 0% -27.74% (p=0.000 n=20+20) CompareBytesEqual-6 10.1ns ± 1% 10.0ns ± 1% ~ (p=0.527 n=20+20) CompareBytesToNil-6 10.0ns ± 2% 9.4ns ± 0% -6.57% (p=0.000 n=20+17) CompareBytesEmpty-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal) CompareBytesIdentical-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal) CompareBytesSameLength-6 10.6ns ± 1% 10.6ns ± 1% ~ (p=0.240 n=20+20) CompareBytesDifferentLength-6 10.6ns ± 0% 10.6ns ± 1% ~ (p=1.000 n=20+20) CompareBytesBigUnaligned-6 132µs ± 1% 105µs ± 1% -20.61% (p=0.000 n=20+18) CompareBytesBig-6 125µs ± 1% 105µs ± 1% -16.31% (p=0.000 n=20+20) CompareBytesBigIdentical-6 8.13ns ± 0% 8.13ns ± 0% ~ (all samples are equal) name old speed new speed delta CompareBytesBigUnaligned-6 7.94GB/s ± 1% 10.01GB/s ± 1% +25.96% (p=0.000 n=20+18) CompareBytesBig-6 8.38GB/s ± 1% 10.01GB/s ± 1% +19.48% (p=0.000 n=20+20) CompareBytesBigIdentical-6 
129TB/s ± 0% 129TB/s ± 0% +0.01% (p=0.003 n=17+19) Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5 Reviewed-on: https://go-review.googlesource.com/16434 Run-TryBot: Ilya Tocar Reviewed-by: Klaus Post TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- src/cmd/internal/obj/x86/a.out.go | 2 + src/cmd/internal/obj/x86/anames.go | 2 + src/cmd/internal/obj/x86/asm6.go | 58 ++++++++++++++++++++++--- src/runtime/asm_amd64.s | 70 +++++++++++++++++++++++++++++- src/runtime/runtime2.go | 2 + 5 files changed, 126 insertions(+), 8 deletions(-) diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index 108ca6a289e..55fc31ddaf1 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -746,6 +746,8 @@ const ( AMOVHDU AMOVNTHD AMOVHDA + AVPCMPEQB + AVPMOVMSKB // from 386 AJCXZW diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index c075a15c80b..729b9d423bc 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -687,6 +687,8 @@ var Anames = []string{ "MOVHDU", "MOVNTHD", "MOVHDA", + "VPCMPEQB", + "VPMOVMSKB", "JCXZW", "FCMOVCC", "FCMOVCS", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index f03df5bf00d..739ba671058 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -195,6 +195,7 @@ const ( Zr_m Zr_m_xm Zr_m_xm_vex + Zr_r_r_vex Zrp_ Z_ib Z_il @@ -630,6 +631,11 @@ var yxr_ml_vex = []ytab{ {Yxr, Ynone, Yml, Zr_m_xm_vex, 1}, } +var yxm_xm_xm = []ytab{ + {Yxr, Yxr, Yxr, Zr_r_r_vex, 1}, + {Yxm, Yxr, Yxr, Zr_r_r_vex, 1}, +} + var ymr = []ytab{ {Ymr, Ynone, Ymr, Zm_r, 1}, } @@ -725,6 +731,10 @@ var ymskb = []ytab{ {Ymr, Ynone, Yrl, Zm_r_xm, 1}, } +var ymskb_vex = []ytab{ + {Yxr, Ynone, Yrl, Zm_r_xm_vex, 2}, +} + var ycrc32l = []ytab{ {Yml, Ynone, Yrl, Zlitm_r, 0}, } @@ -1497,6 +1507,8 @@ var optab = {AMOVHDU, yxmov_vex, Pvex2, [23]uint8{0x6f, 0x7f}}, {AMOVNTHD, yxr_ml_vex, 
Pvex1, [23]uint8{0xe7}}, {AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}}, + {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}}, + {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}}, {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}}, {obj.ATYPE, nil, 0, [23]uint8{}}, {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}}, @@ -2943,11 +2955,15 @@ var bpduff2 = []byte{ 0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP } -func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) { +// Assemble vex prefix, from 3 operands and prefix. +// For details about vex prefix see: +// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description +func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pref uint8) { rexR := regrex[to.Reg] rexB := regrex[from.Reg] rexX := regrex[from.Index] var prefBit uint8 + // This will go into VEX.PP field. if pref == Pvex1 { prefBit = 1 } else if pref == Pvex2 { @@ -2955,21 +2971,36 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) { } // TODO add Pvex0,Pvex3 if rexX == 0 && rexB == 0 { // 2-byte vex prefix + // In 2-byte case, first byte is always C5 ctxt.Andptr[0] = 0xc5 ctxt.Andptr = ctxt.Andptr[1:] - if rexR != 0 { + if from3 == nil { + // If this is a 2-operand instruction fill VEX.VVVV with 1111 + // We are also interested only in 256-bit version, so VEX.L=1 ctxt.Andptr[0] = 0x7c } else { - ctxt.Andptr[0] = 0xfc + // VEX.L=1 + ctxt.Andptr[0] = 0x4 + // VEX.VVVV (bits 3:6) is an inverted register number + ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78 + } + + // VEX encodes REX.R as inverted upper bit + if rexR == 0 { + ctxt.Andptr[0] |= 0x80 } ctxt.Andptr[0] |= prefBit ctxt.Andptr = ctxt.Andptr[1:] - } else { + } else { // 3-byte case + // First byte is always C4 ctxt.Andptr[0] = 0xc4 ctxt.Andptr = ctxt.Andptr[1:] + // Encode VEX.mmmmm with prefix value, for now assume 0F, + // which encodes as 1. 
ctxt.Andptr[0] = 0x1 // TODO handle different prefix + // REX.[RXB] are inverted and encoded in 3 upper bits if rexR == 0 { ctxt.Andptr[0] |= 0x80 } @@ -2981,7 +3012,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) { } ctxt.Andptr = ctxt.Andptr[1:] - ctxt.Andptr[0] = 0x7c + // Fill VEX.VVVV, same as 2-operand VEX instruction. + if from3 == nil { + ctxt.Andptr[0] = 0x7c + } else { + ctxt.Andptr[0] = 0x4 + ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78 + } ctxt.Andptr[0] |= prefBit ctxt.Andptr = ctxt.Andptr[1:] } @@ -3222,7 +3259,7 @@ func doasm(ctxt *obj.Link, p *obj.Prog) { case Zm_r_xm_vex: ctxt.Vexflag = 1 - vexprefix(ctxt, &p.To, &p.From, o.prefix) + vexprefix(ctxt, &p.To, &p.From, nil, o.prefix) ctxt.Andptr[0] = byte(op) ctxt.Andptr = ctxt.Andptr[1:] asmand(ctxt, p, &p.From, &p.To) @@ -3284,11 +3321,18 @@ func doasm(ctxt *obj.Link, p *obj.Prog) { case Zr_m_xm_vex: ctxt.Vexflag = 1 - vexprefix(ctxt, &p.From, &p.To, o.prefix) + vexprefix(ctxt, &p.From, &p.To, nil, o.prefix) ctxt.Andptr[0] = byte(op) ctxt.Andptr = ctxt.Andptr[1:] asmand(ctxt, p, &p.To, &p.From) + case Zr_r_r_vex: + ctxt.Vexflag = 1 + vexprefix(ctxt, &p.To, &p.From, p.From3, o.prefix) + ctxt.Andptr[0] = byte(op) + ctxt.Andptr = ctxt.Andptr[1:] + asmand(ctxt, p, &p.From, &p.To) + case Zr_m_xm: mediaop(ctxt, o, op, int(yt.zoffset), z) asmand(ctxt, p, &p.To, &p.From) diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 454789c5098..33d641e6122 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 JNE notintel MOVB $1, runtime·lfenceBeforeRdtsc(SB) notintel: + // Do nothing. 
MOVQ $1, AX CPUID MOVL CX, runtime·cpuid_ecx(SB) MOVL DX, runtime·cpuid_edx(SB) + // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1] + // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf + ANDL $0x18000000, CX // check for OSXSAVE and AVX bits + CMPL CX, $0x18000000 + JNE noavx + MOVL $0, CX + // For XGETBV, OSXSAVE bit is required and sufficient + BYTE $0x0F; BYTE $0x01; BYTE $0xD0 + ANDL $6, AX + CMPL AX, $6 // Check for OS support of YMM registers + JNE noavx + MOVB $1, runtime·support_avx(SB) + MOVL $7, AX + MOVL $0, CX + CPUID + ANDL $0x20, BX // check for AVX2 bit + CMPL BX, $0x20 + JNE noavx2 + MOVB $1, runtime·support_avx2(SB) + JMP nocpuinfo +noavx: + MOVB $0, runtime·support_avx(SB) +noavx2: + MOVB $0, runtime·support_avx2(SB) nocpuinfo: // if there is an _cgo_init, call it. @@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 JB small CMPQ R8, $63 - JA big_loop + JBE loop + CMPB runtime·support_avx2(SB), $1 + JEQ big_loop_avx2 + JMP big_loop loop: CMPQ R8, $16 JBE _0through16 @@ -1657,6 +1686,45 @@ big_loop: JBE loop JMP big_loop + // Compare 64-bytes per loop iteration. + // Loop is unrolled and uses AVX2. +big_loop_avx2: + MOVHDU (SI), X2 + MOVHDU (DI), X3 + MOVHDU 32(SI), X4 + MOVHDU 32(DI), X5 + VPCMPEQB X2, X3, X0 + VPMOVMSKB X0, AX + XORL $0xffffffff, AX + JNE diff32_avx2 + VPCMPEQB X4, X5, X6 + VPMOVMSKB X6, AX + XORL $0xffffffff, AX + JNE diff64_avx2 + + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, R8 + CMPQ R8, $64 + JB big_loop_avx2_exit + JMP big_loop_avx2 + + // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. +diff32_avx2: + VZEROUPPER + JMP diff16 + + // Same as diff32_avx2, but for last 32 bytes. +diff64_avx2: + VZEROUPPER + JMP diff48 + + // For <64 bytes remainder jump to normal loop. 
+big_loop_avx2_exit: + VZEROUPPER + JMP loop + + TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 MOVQ s+0(FP), SI MOVQ s_len+8(FP), BX diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 6b61cd62fa1..f1337e570ec 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -627,6 +627,8 @@ var ( cpuid_ecx uint32 cpuid_edx uint32 lfenceBeforeRdtsc bool + support_avx bool + support_avx2 bool goarm uint8 // set by cmd/link on arm systems )