mirror of
https://github.com/golang/go
synced 2024-11-23 23:40:13 -07:00
bytes: speed up Compare() on amd64
Use AVX2 if available. Results (haswell), below: name old time/op new time/op delta BytesCompare1-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare2-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare4-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare8-6 9.29ns ± 2% 8.76ns ± 0% -5.72% (p=0.000 n=16+17) BytesCompare16-6 9.29ns ± 2% 9.20ns ± 0% -1.02% (p=0.000 n=20+16) BytesCompare32-6 11.4ns ± 1% 11.4ns ± 0% ~ (p=0.191 n=20+20) BytesCompare64-6 14.4ns ± 0% 13.1ns ± 0% -8.68% (p=0.000 n=20+20) BytesCompare128-6 20.2ns ± 0% 18.5ns ± 0% -8.27% (p=0.000 n=16+20) BytesCompare256-6 29.3ns ± 0% 24.5ns ± 0% -16.38% (p=0.000 n=16+16) BytesCompare512-6 46.8ns ± 0% 37.1ns ± 0% -20.78% (p=0.000 n=18+16) BytesCompare1024-6 82.9ns ± 0% 62.3ns ± 0% -24.86% (p=0.000 n=20+14) BytesCompare2048-6 155ns ± 0% 112ns ± 0% -27.74% (p=0.000 n=20+20) CompareBytesEqual-6 10.1ns ± 1% 10.0ns ± 1% ~ (p=0.527 n=20+20) CompareBytesToNil-6 10.0ns ± 2% 9.4ns ± 0% -6.57% (p=0.000 n=20+17) CompareBytesEmpty-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal) CompareBytesIdentical-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal) CompareBytesSameLength-6 10.6ns ± 1% 10.6ns ± 1% ~ (p=0.240 n=20+20) CompareBytesDifferentLength-6 10.6ns ± 0% 10.6ns ± 1% ~ (p=1.000 n=20+20) CompareBytesBigUnaligned-6 132µs ± 1% 105µs ± 1% -20.61% (p=0.000 n=20+18) CompareBytesBig-6 125µs ± 1% 105µs ± 1% -16.31% (p=0.000 n=20+20) CompareBytesBigIdentical-6 8.13ns ± 0% 8.13ns ± 0% ~ (all samples are equal) name old speed new speed delta CompareBytesBigUnaligned-6 7.94GB/s ± 1% 10.01GB/s ± 1% +25.96% (p=0.000 n=20+18) CompareBytesBig-6 8.38GB/s ± 1% 10.01GB/s ± 1% +19.48% (p=0.000 n=20+20) CompareBytesBigIdentical-6 129TB/s ± 0% 129TB/s ± 0% +0.01% (p=0.003 n=17+19) Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5 Reviewed-on: https://go-review.googlesource.com/16434 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Klaus Post <klauspost@gmail.com> TryBot-Result: Gobot 
Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
cf73357e37
commit
0e23ca41d9
@ -746,6 +746,8 @@ const (
|
||||
AMOVHDU
|
||||
AMOVNTHD
|
||||
AMOVHDA
|
||||
AVPCMPEQB
|
||||
AVPMOVMSKB
|
||||
|
||||
// from 386
|
||||
AJCXZW
|
||||
|
@ -687,6 +687,8 @@ var Anames = []string{
|
||||
"MOVHDU",
|
||||
"MOVNTHD",
|
||||
"MOVHDA",
|
||||
"VPCMPEQB",
|
||||
"VPMOVMSKB",
|
||||
"JCXZW",
|
||||
"FCMOVCC",
|
||||
"FCMOVCS",
|
||||
|
@ -195,6 +195,7 @@ const (
|
||||
Zr_m
|
||||
Zr_m_xm
|
||||
Zr_m_xm_vex
|
||||
Zr_r_r_vex
|
||||
Zrp_
|
||||
Z_ib
|
||||
Z_il
|
||||
@ -630,6 +631,11 @@ var yxr_ml_vex = []ytab{
|
||||
{Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
|
||||
}
|
||||
|
||||
var yxm_xm_xm = []ytab{
|
||||
{Yxr, Yxr, Yxr, Zr_r_r_vex, 1},
|
||||
{Yxm, Yxr, Yxr, Zr_r_r_vex, 1},
|
||||
}
|
||||
|
||||
var ymr = []ytab{
|
||||
{Ymr, Ynone, Ymr, Zm_r, 1},
|
||||
}
|
||||
@ -725,6 +731,10 @@ var ymskb = []ytab{
|
||||
{Ymr, Ynone, Yrl, Zm_r_xm, 1},
|
||||
}
|
||||
|
||||
var ymskb_vex = []ytab{
|
||||
{Yxr, Ynone, Yrl, Zm_r_xm_vex, 2},
|
||||
}
|
||||
|
||||
var ycrc32l = []ytab{
|
||||
{Yml, Ynone, Yrl, Zlitm_r, 0},
|
||||
}
|
||||
@ -1497,6 +1507,8 @@ var optab =
|
||||
{AMOVHDU, yxmov_vex, Pvex2, [23]uint8{0x6f, 0x7f}},
|
||||
{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
|
||||
{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
|
||||
{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
|
||||
{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
|
||||
{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
|
||||
{obj.ATYPE, nil, 0, [23]uint8{}},
|
||||
{obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
|
||||
@ -2943,11 +2955,15 @@ var bpduff2 = []byte{
|
||||
0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
|
||||
}
|
||||
|
||||
func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
|
||||
// Assemble vex prefix, from 3 operands and prefix.
|
||||
// For details about vex prefix see:
|
||||
// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
|
||||
func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pref uint8) {
|
||||
rexR := regrex[to.Reg]
|
||||
rexB := regrex[from.Reg]
|
||||
rexX := regrex[from.Index]
|
||||
var prefBit uint8
|
||||
// This will go into VEX.PP field.
|
||||
if pref == Pvex1 {
|
||||
prefBit = 1
|
||||
} else if pref == Pvex2 {
|
||||
@ -2955,21 +2971,36 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
|
||||
} // TODO add Pvex0,Pvex3
|
||||
|
||||
if rexX == 0 && rexB == 0 { // 2-byte vex prefix
|
||||
// In 2-byte case, first byte is always C5
|
||||
ctxt.Andptr[0] = 0xc5
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
|
||||
if rexR != 0 {
|
||||
if from3 == nil {
|
||||
// If this is a 2-operand instruction fill VEX.VVVV with 1111
|
||||
// We are also interested only in 256-bit version, so VEX.L=1
|
||||
ctxt.Andptr[0] = 0x7c
|
||||
} else {
|
||||
ctxt.Andptr[0] = 0xfc
|
||||
// VEX.L=1
|
||||
ctxt.Andptr[0] = 0x4
|
||||
// VEX.VVVV (bits 3:6) is the inverted register number
|
||||
ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
|
||||
}
|
||||
|
||||
// VEX encodes REX.R inverted, in the upper bit
|
||||
if rexR == 0 {
|
||||
ctxt.Andptr[0] |= 0x80
|
||||
}
|
||||
ctxt.Andptr[0] |= prefBit
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
} else {
|
||||
} else { // 3-byte case
|
||||
// First byte is always C4
|
||||
ctxt.Andptr[0] = 0xc4
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
|
||||
// Encode VEX.mmmmm with the opcode map, for now assume 0F,
|
||||
// which encodes as 1.
|
||||
ctxt.Andptr[0] = 0x1 // TODO handle different prefix
|
||||
// REX.[RXB] are inverted and encoded in 3 upper bits
|
||||
if rexR == 0 {
|
||||
ctxt.Andptr[0] |= 0x80
|
||||
}
|
||||
@ -2981,7 +3012,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
|
||||
}
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
|
||||
ctxt.Andptr[0] = 0x7c
|
||||
// Fill VEX.VVVV, same as 2-operand VEX instruction.
|
||||
if from3 == nil {
|
||||
ctxt.Andptr[0] = 0x7c
|
||||
} else {
|
||||
ctxt.Andptr[0] = 0x4
|
||||
ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
|
||||
}
|
||||
ctxt.Andptr[0] |= prefBit
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
}
|
||||
@ -3222,7 +3259,7 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
|
||||
|
||||
case Zm_r_xm_vex:
|
||||
ctxt.Vexflag = 1
|
||||
vexprefix(ctxt, &p.To, &p.From, o.prefix)
|
||||
vexprefix(ctxt, &p.To, &p.From, nil, o.prefix)
|
||||
ctxt.Andptr[0] = byte(op)
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
asmand(ctxt, p, &p.From, &p.To)
|
||||
@ -3284,11 +3321,18 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
|
||||
|
||||
case Zr_m_xm_vex:
|
||||
ctxt.Vexflag = 1
|
||||
vexprefix(ctxt, &p.From, &p.To, o.prefix)
|
||||
vexprefix(ctxt, &p.From, &p.To, nil, o.prefix)
|
||||
ctxt.Andptr[0] = byte(op)
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
asmand(ctxt, p, &p.To, &p.From)
|
||||
|
||||
case Zr_r_r_vex:
|
||||
ctxt.Vexflag = 1
|
||||
vexprefix(ctxt, &p.To, &p.From, p.From3, o.prefix)
|
||||
ctxt.Andptr[0] = byte(op)
|
||||
ctxt.Andptr = ctxt.Andptr[1:]
|
||||
asmand(ctxt, p, &p.From, &p.To)
|
||||
|
||||
case Zr_m_xm:
|
||||
mediaop(ctxt, o, op, int(yt.zoffset), z)
|
||||
asmand(ctxt, p, &p.To, &p.From)
|
||||
|
@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
|
||||
JNE notintel
|
||||
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
||||
notintel:
|
||||
// Do nothing.
|
||||
|
||||
MOVQ $1, AX
|
||||
CPUID
|
||||
MOVL CX, runtime·cpuid_ecx(SB)
|
||||
MOVL DX, runtime·cpuid_edx(SB)
|
||||
// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
|
||||
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
|
||||
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
|
||||
ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
|
||||
CMPL CX, $0x18000000
|
||||
JNE noavx
|
||||
MOVL $0, CX
|
||||
// For XGETBV, OSXSAVE bit is required and sufficient
|
||||
BYTE $0x0F; BYTE $0x01; BYTE $0xD0
|
||||
ANDL $6, AX
|
||||
CMPL AX, $6 // Check for OS support of YMM registers
|
||||
JNE noavx
|
||||
MOVB $1, runtime·support_avx(SB)
|
||||
MOVL $7, AX
|
||||
MOVL $0, CX
|
||||
CPUID
|
||||
ANDL $0x20, BX // check for AVX2 bit
|
||||
CMPL BX, $0x20
|
||||
JNE noavx2
|
||||
MOVB $1, runtime·support_avx2(SB)
|
||||
JMP nocpuinfo
|
||||
noavx:
|
||||
MOVB $0, runtime·support_avx(SB)
|
||||
noavx2:
|
||||
MOVB $0, runtime·support_avx2(SB)
|
||||
nocpuinfo:
|
||||
|
||||
// if there is an _cgo_init, call it.
|
||||
@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
|
||||
JB small
|
||||
|
||||
CMPQ R8, $63
|
||||
JA big_loop
|
||||
JBE loop
|
||||
CMPB runtime·support_avx2(SB), $1
|
||||
JEQ big_loop_avx2
|
||||
JMP big_loop
|
||||
loop:
|
||||
CMPQ R8, $16
|
||||
JBE _0through16
|
||||
@ -1657,6 +1686,45 @@ big_loop:
|
||||
JBE loop
|
||||
JMP big_loop
|
||||
|
||||
// Compare 64-bytes per loop iteration.
|
||||
// Loop is unrolled and uses AVX2.
|
||||
big_loop_avx2:
|
||||
MOVHDU (SI), X2
|
||||
MOVHDU (DI), X3
|
||||
MOVHDU 32(SI), X4
|
||||
MOVHDU 32(DI), X5
|
||||
VPCMPEQB X2, X3, X0
|
||||
VPMOVMSKB X0, AX
|
||||
XORL $0xffffffff, AX
|
||||
JNE diff32_avx2
|
||||
VPCMPEQB X4, X5, X6
|
||||
VPMOVMSKB X6, AX
|
||||
XORL $0xffffffff, AX
|
||||
JNE diff64_avx2
|
||||
|
||||
ADDQ $64, SI
|
||||
ADDQ $64, DI
|
||||
SUBQ $64, R8
|
||||
CMPQ R8, $64
|
||||
JB big_loop_avx2_exit
|
||||
JMP big_loop_avx2
|
||||
|
||||
// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
|
||||
diff32_avx2:
|
||||
VZEROUPPER
|
||||
JMP diff16
|
||||
|
||||
// Same as diff32_avx2, but for last 32 bytes.
|
||||
diff64_avx2:
|
||||
VZEROUPPER
|
||||
JMP diff48
|
||||
|
||||
// For <64 bytes remainder jump to normal loop.
|
||||
big_loop_avx2_exit:
|
||||
VZEROUPPER
|
||||
JMP loop
|
||||
|
||||
|
||||
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
|
||||
MOVQ s+0(FP), SI
|
||||
MOVQ s_len+8(FP), BX
|
||||
|
@ -627,6 +627,8 @@ var (
|
||||
cpuid_ecx uint32
|
||||
cpuid_edx uint32
|
||||
lfenceBeforeRdtsc bool
|
||||
support_avx bool
|
||||
support_avx2 bool
|
||||
|
||||
goarm uint8 // set by cmd/link on arm systems
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user