From 0e23ca41d99c82d301badf1b762888e2c69e6c57 Mon Sep 17 00:00:00 2001
From: Ilya Tocar <ilya.tocar@intel.com>
Date: Wed, 28 Oct 2015 23:20:26 +0300
Subject: [PATCH] bytes: speed up Compare() on amd64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use AVX2 if available.
Results (haswell), below:

name                           old time/op    new time/op     delta
BytesCompare1-6                  11.4ns ± 0%     11.4ns ± 0%     ~     (all samples are equal)
BytesCompare2-6                  11.4ns ± 0%     11.4ns ± 0%     ~     (all samples are equal)
BytesCompare4-6                  11.4ns ± 0%     11.4ns ± 0%     ~     (all samples are equal)
BytesCompare8-6                  9.29ns ± 2%     8.76ns ± 0%   -5.72%        (p=0.000 n=16+17)
BytesCompare16-6                 9.29ns ± 2%     9.20ns ± 0%   -1.02%        (p=0.000 n=20+16)
BytesCompare32-6                 11.4ns ± 1%     11.4ns ± 0%     ~           (p=0.191 n=20+20)
BytesCompare64-6                 14.4ns ± 0%     13.1ns ± 0%   -8.68%        (p=0.000 n=20+20)
BytesCompare128-6                20.2ns ± 0%     18.5ns ± 0%   -8.27%        (p=0.000 n=16+20)
BytesCompare256-6                29.3ns ± 0%     24.5ns ± 0%  -16.38%        (p=0.000 n=16+16)
BytesCompare512-6                46.8ns ± 0%     37.1ns ± 0%  -20.78%        (p=0.000 n=18+16)
BytesCompare1024-6               82.9ns ± 0%     62.3ns ± 0%  -24.86%        (p=0.000 n=20+14)
BytesCompare2048-6                155ns ± 0%      112ns ± 0%  -27.74%        (p=0.000 n=20+20)
CompareBytesEqual-6              10.1ns ± 1%     10.0ns ± 1%     ~           (p=0.527 n=20+20)
CompareBytesToNil-6              10.0ns ± 2%      9.4ns ± 0%   -6.57%        (p=0.000 n=20+17)
CompareBytesEmpty-6              8.76ns ± 0%     8.76ns ± 0%     ~     (all samples are equal)
CompareBytesIdentical-6          8.76ns ± 0%     8.76ns ± 0%     ~     (all samples are equal)
CompareBytesSameLength-6         10.6ns ± 1%     10.6ns ± 1%     ~           (p=0.240 n=20+20)
CompareBytesDifferentLength-6    10.6ns ± 0%     10.6ns ± 1%     ~           (p=1.000 n=20+20)
CompareBytesBigUnaligned-6        132±s ± 1%      105±s ± 1%  -20.61%        (p=0.000 n=20+18)
CompareBytesBig-6                 125±s ± 1%      105±s ± 1%  -16.31%        (p=0.000 n=20+20)
CompareBytesBigIdentical-6       8.13ns ± 0%     8.13ns ± 0%     ~     (all samples are equal)

name                           old speed      new speed       delta
CompareBytesBigUnaligned-6     7.94GB/s ± 1%  10.01GB/s ± 1%  +25.96%        (p=0.000 n=20+18)
CompareBytesBig-6              8.38GB/s ± 1%  10.01GB/s ± 1%  +19.48%        (p=0.000 n=20+20)
CompareBytesBigIdentical-6      129TB/s ± 0%    129TB/s ± 0%   +0.01%        (p=0.003 n=17+19)

Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5
Reviewed-on: https://go-review.googlesource.com/16434
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Klaus Post <klauspost@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
---
 src/cmd/internal/obj/x86/a.out.go  |  2 +
 src/cmd/internal/obj/x86/anames.go |  2 +
 src/cmd/internal/obj/x86/asm6.go   | 58 ++++++++++++++++++++++---
 src/runtime/asm_amd64.s            | 70 +++++++++++++++++++++++++++++-
 src/runtime/runtime2.go            |  2 +
 5 files changed, 126 insertions(+), 8 deletions(-)

diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go
index 108ca6a289e..55fc31ddaf1 100644
--- a/src/cmd/internal/obj/x86/a.out.go
+++ b/src/cmd/internal/obj/x86/a.out.go
@@ -746,6 +746,8 @@ const (
 	AMOVHDU
 	AMOVNTHD
 	AMOVHDA
+	AVPCMPEQB
+	AVPMOVMSKB
 
 	// from 386
 	AJCXZW
diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go
index c075a15c80b..729b9d423bc 100644
--- a/src/cmd/internal/obj/x86/anames.go
+++ b/src/cmd/internal/obj/x86/anames.go
@@ -687,6 +687,8 @@ var Anames = []string{
 	"MOVHDU",
 	"MOVNTHD",
 	"MOVHDA",
+	"VPCMPEQB",
+	"VPMOVMSKB",
 	"JCXZW",
 	"FCMOVCC",
 	"FCMOVCS",
diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go
index f03df5bf00d..739ba671058 100644
--- a/src/cmd/internal/obj/x86/asm6.go
+++ b/src/cmd/internal/obj/x86/asm6.go
@@ -195,6 +195,7 @@ const (
 	Zr_m
 	Zr_m_xm
 	Zr_m_xm_vex
+	Zr_r_r_vex
 	Zrp_
 	Z_ib
 	Z_il
@@ -630,6 +631,11 @@ var yxr_ml_vex = []ytab{
 	{Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
 }
 
+var yxm_xm_xm = []ytab{
+	{Yxr, Yxr, Yxr, Zr_r_r_vex, 1},
+	{Yxm, Yxr, Yxr, Zr_r_r_vex, 1},
+}
+
 var ymr = []ytab{
 	{Ymr, Ynone, Ymr, Zm_r, 1},
 }
@@ -725,6 +731,10 @@ var ymskb = []ytab{
 	{Ymr, Ynone, Yrl, Zm_r_xm, 1},
 }
 
+var ymskb_vex = []ytab{
+	{Yxr, Ynone, Yrl, Zm_r_xm_vex, 2},
+}
+
 var ycrc32l = []ytab{
 	{Yml, Ynone, Yrl, Zlitm_r, 0},
 }
@@ -1497,6 +1507,8 @@ var optab =
 	{AMOVHDU, yxmov_vex, Pvex2, [23]uint8{0x6f, 0x7f}},
 	{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
 	{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
+	{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
+	{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
 	{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
 	{obj.ATYPE, nil, 0, [23]uint8{}},
 	{obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
@@ -2943,11 +2955,15 @@ var bpduff2 = []byte{
 	0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
 }
 
-func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
+// Assemble vex prefix, from 3 operands and prefix.
+// For details about vex prefix see:
+// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
+func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pref uint8) {
 	rexR := regrex[to.Reg]
 	rexB := regrex[from.Reg]
 	rexX := regrex[from.Index]
 	var prefBit uint8
+	// This will go into VEX.PP field.
 	if pref == Pvex1 {
 		prefBit = 1
 	} else if pref == Pvex2 {
@@ -2955,21 +2971,36 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
 	} // TODO add Pvex0,Pvex3
 
 	if rexX == 0 && rexB == 0 { // 2-byte vex prefix
+		// In 2-byte case, first byte is always C5
 		ctxt.Andptr[0] = 0xc5
 		ctxt.Andptr = ctxt.Andptr[1:]
 
-		if rexR != 0 {
+		if from3 == nil {
+			// If this is a 2-operand instruction fill VEX.VVVV with 1111
+			// We are also interested only in 256-bit version, so VEX.L=1
 			ctxt.Andptr[0] = 0x7c
 		} else {
-			ctxt.Andptr[0] = 0xfc
+			// VEX.L=1
+			ctxt.Andptr[0] = 0x4
+			// VEX.VVVV (bits 3:6) is a inversed register number
+			ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
+		}
+
+		// VEX encodes REX.R as inversed upper bit
+		if rexR == 0 {
+			ctxt.Andptr[0] |= 0x80
 		}
 		ctxt.Andptr[0] |= prefBit
 		ctxt.Andptr = ctxt.Andptr[1:]
-	} else {
+	} else { // 3-byte case
+		// First byte is always C$
 		ctxt.Andptr[0] = 0xc4
 		ctxt.Andptr = ctxt.Andptr[1:]
 
+		// Encode VEX.mmmmm with prefix value, for now assume 0F 38,
+		// which encodes as 1.
 		ctxt.Andptr[0] = 0x1 // TODO handle different prefix
+		// REX.[RXB] are inverted and encoded in 3 upper bits
 		if rexR == 0 {
 			ctxt.Andptr[0] |= 0x80
 		}
@@ -2981,7 +3012,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
 		}
 		ctxt.Andptr = ctxt.Andptr[1:]
 
-		ctxt.Andptr[0] = 0x7c
+		// Fill VEX.VVVV, same as 2-operand VEX instruction.
+		if from3 == nil {
+			ctxt.Andptr[0] = 0x7c
+		} else {
+			ctxt.Andptr[0] = 0x4
+			ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
+		}
 		ctxt.Andptr[0] |= prefBit
 		ctxt.Andptr = ctxt.Andptr[1:]
 	}
@@ -3222,7 +3259,7 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
 
 			case Zm_r_xm_vex:
 				ctxt.Vexflag = 1
-				vexprefix(ctxt, &p.To, &p.From, o.prefix)
+				vexprefix(ctxt, &p.To, &p.From, nil, o.prefix)
 				ctxt.Andptr[0] = byte(op)
 				ctxt.Andptr = ctxt.Andptr[1:]
 				asmand(ctxt, p, &p.From, &p.To)
@@ -3284,11 +3321,18 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
 
 			case Zr_m_xm_vex:
 				ctxt.Vexflag = 1
-				vexprefix(ctxt, &p.From, &p.To, o.prefix)
+				vexprefix(ctxt, &p.From, &p.To, nil, o.prefix)
 				ctxt.Andptr[0] = byte(op)
 				ctxt.Andptr = ctxt.Andptr[1:]
 				asmand(ctxt, p, &p.To, &p.From)
 
+			case Zr_r_r_vex:
+				ctxt.Vexflag = 1
+				vexprefix(ctxt, &p.To, &p.From, p.From3, o.prefix)
+				ctxt.Andptr[0] = byte(op)
+				ctxt.Andptr = ctxt.Andptr[1:]
+				asmand(ctxt, p, &p.From, &p.To)
+
 			case Zr_m_xm:
 				mediaop(ctxt, o, op, int(yt.zoffset), z)
 				asmand(ctxt, p, &p.To, &p.From)
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 454789c5098..33d641e6122 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
 	JNE	notintel
 	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
 notintel:
+	// Do nothing.
 
 	MOVQ	$1, AX
 	CPUID
 	MOVL	CX, runtime·cpuid_ecx(SB)
 	MOVL	DX, runtime·cpuid_edx(SB)
+	// Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
+	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
+	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
+	CMPL    CX, $0x18000000
+	JNE     noavx
+	MOVL    $0, CX
+	// For XGETBV, OSXSAVE bit is required and sufficient
+	BYTE $0x0F; BYTE $0x01; BYTE $0xD0
+	ANDL    $6, AX
+	CMPL    AX, $6 // Check for OS support of YMM registers
+	JNE     noavx
+	MOVB    $1, runtime·support_avx(SB)
+	MOVL    $7, AX
+	MOVL    $0, CX
+	CPUID
+	ANDL    $0x20, BX // check for AVX2 bit
+	CMPL    BX, $0x20
+	JNE     noavx2
+	MOVB    $1, runtime·support_avx2(SB)
+	JMP     nocpuinfo
+noavx:
+	MOVB    $0, runtime·support_avx(SB)
+noavx2:
+	MOVB    $0, runtime·support_avx2(SB)
 nocpuinfo:	
 	
 	// if there is an _cgo_init, call it.
@@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
 	JB	small
 
 	CMPQ	R8, $63
-	JA	big_loop
+	JBE	loop
+	CMPB    runtime·support_avx2(SB), $1
+	JEQ     big_loop_avx2
+	JMP	big_loop
 loop:
 	CMPQ	R8, $16
 	JBE	_0through16
@@ -1657,6 +1686,45 @@ big_loop:
 	JBE	loop
 	JMP	big_loop
 
+	// Compare 64-bytes per loop iteration.
+	// Loop is unrolled and uses AVX2.
+big_loop_avx2:
+	MOVHDU	(SI), X2
+	MOVHDU	(DI), X3
+	MOVHDU	32(SI), X4
+	MOVHDU	32(DI), X5
+	VPCMPEQB X2, X3, X0
+	VPMOVMSKB X0, AX
+	XORL	$0xffffffff, AX
+	JNE	diff32_avx2
+	VPCMPEQB X4, X5, X6
+	VPMOVMSKB X6, AX
+	XORL	$0xffffffff, AX
+	JNE	diff64_avx2
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JB	big_loop_avx2_exit
+	JMP	big_loop_avx2
+
+	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+	VZEROUPPER
+	JMP diff16
+
+	// Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+	VZEROUPPER
+	JMP diff48
+
+	// For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+	VZEROUPPER
+	JMP loop
+
+
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
 	MOVQ s+0(FP), SI
 	MOVQ s_len+8(FP), BX
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 6b61cd62fa1..f1337e570ec 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -627,6 +627,8 @@ var (
 	cpuid_ecx         uint32
 	cpuid_edx         uint32
 	lfenceBeforeRdtsc bool
+	support_avx       bool
+	support_avx2      bool
 
 	goarm uint8 // set by cmd/link on arm systems
 )