From 6b02a1924725688b4d264065454ac5287fbed535 Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Thu, 21 Apr 2016 18:24:12 +0300 Subject: [PATCH] strings: use SSE4.2 in strings.Index on AMD64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use PCMPESTRI instruction if available. Index-4 21.1ns ± 0% 21.1ns ± 0% ~ (all samples are equal) IndexHard1-4 395µs ± 0% 105µs ± 0% -73.53% (p=0.000 n=19+20) IndexHard2-4 300µs ± 0% 147µs ± 0% -51.11% (p=0.000 n=19+20) IndexHard3-4 665µs ± 0% 665µs ± 0% ~ (p=0.942 n=16+19) Change-Id: I4f66794164740a2b939eb1c78934e2390b489064 Reviewed-on: https://go-review.googlesource.com/22337 Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot Reviewed-by: Russ Cox --- src/cmd/internal/obj/x86/a.out.go | 2 + src/cmd/internal/obj/x86/anames.go | 1 + src/cmd/internal/obj/x86/asm6.go | 1 + src/runtime/asm_amd64.s | 157 ++++++++++++++++++----------- 4 files changed, 103 insertions(+), 58 deletions(-) diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index 64bd865e42d..c41fd953e76 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -739,6 +739,8 @@ const ( AUNPCKLPS AXORPD AXORPS + APCMPESTRI + ARETFW ARETFL ARETFQ diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 3b59e2f36fc..e3fef54e717 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -682,6 +682,7 @@ var Anames = []string{ "UNPCKLPS", "XORPD", "XORPS", + "PCMPESTRI", "RETFW", "RETFL", "RETFQ", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 57ef045b98e..1c7fcf37be3 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -1648,6 +1648,7 @@ var optab = {AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}}, {APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}}, {APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}}, + {APCMPESTRI, yxshuf, Pq, [23]uint8{0x3a, 0x61, 0}}, {AANDNL, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W0, 0xF2}}, {AANDNQ, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W1, 0xF2}}, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 83db4d3e810..cdda29f347e 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1666,122 +1666,126 @@ big_loop_avx2_exit: // TODO: Also use this in bytes.Index TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 MOVQ s+0(FP), DI - MOVQ s_len+8(FP), CX - MOVQ c+16(FP), AX - MOVQ c_len+24(FP), BX - CMPQ BX, CX + // We want len in DX and AX, because PCMPESTRI implicitly consumes them + MOVQ s_len+8(FP), DX + MOVQ c+16(FP), BP + MOVQ c_len+24(FP), AX + CMPQ AX, DX JA fail - CMPQ BX, $2 + CMPQ DX, $16 + JAE sse42 +no_sse42: + CMPQ AX, $2 JA _3_or_more - MOVW (AX), AX - LEAQ -1(DI)(CX*1), CX + MOVW (BP), BP + LEAQ -1(DI)(DX*1), DX loop2: MOVW (DI), SI - CMPW SI,AX + CMPW SI,BP JZ success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop2 JMP fail _3_or_more: - CMPQ BX, $3 + CMPQ AX, $3 JA _4_or_more - MOVW 1(AX), DX - MOVW (AX), AX - LEAQ -2(DI)(CX*1), CX + MOVW 1(BP), BX + MOVW (BP), BP + LEAQ -2(DI)(DX*1), DX loop3: MOVW (DI), SI - CMPW SI,AX + CMPW SI,BP JZ partial_success3 ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop3 JMP fail partial_success3: MOVW 1(DI), SI - CMPW SI,DX + CMPW SI,BX JZ success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop3 JMP fail _4_or_more: - CMPQ BX, $4 + CMPQ AX, $4 JA _5_or_more - MOVL (AX), AX - LEAQ -3(DI)(CX*1), CX + MOVL (BP), BP + LEAQ -3(DI)(DX*1), DX loop4: MOVL (DI), SI - CMPL SI,AX + CMPL SI,BP JZ success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop4 JMP fail _5_or_more: - CMPQ BX, $7 + CMPQ AX, $7 JA _8_or_more - LEAQ 1(DI)(CX*1), CX - SUBQ BX, CX - MOVL -4(AX)(BX*1), DX - MOVL (AX), AX + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVL -4(BP)(AX*1), BX + MOVL (BP), BP loop5to7: MOVL (DI), SI - CMPL SI,AX + CMPL SI,BP JZ partial_success5to7 ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop5to7 JMP fail partial_success5to7: - MOVL -4(BX)(DI*1), SI - CMPL SI,DX + MOVL -4(AX)(DI*1), SI + CMPL SI,BX JZ success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop5to7 JMP fail _8_or_more: - CMPQ BX, $8 + CMPQ AX, $8 JA _9_or_more - MOVQ (AX), AX - LEAQ -7(DI)(CX*1), CX + MOVQ (BP), BP + LEAQ -7(DI)(DX*1), DX loop8: MOVQ (DI), SI - CMPQ SI,AX + CMPQ SI,BP JZ success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop8 JMP fail _9_or_more: - CMPQ BX, $16 + CMPQ AX, $16 JA _16_or_more - LEAQ 1(DI)(CX*1), CX - SUBQ BX, CX - MOVQ -8(AX)(BX*1), DX - MOVQ (AX), AX + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVQ -8(BP)(AX*1), BX + MOVQ (BP), BP loop9to15: MOVQ (DI), SI - CMPQ SI,AX + CMPQ SI,BP JZ partial_success9to15 ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop9to15 JMP fail partial_success9to15: - MOVQ -8(BX)(DI*1), SI - CMPQ SI,DX + MOVQ -8(AX)(DI*1), SI + CMPQ SI,BX JZ success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop9to15 JMP fail _16_or_more: - CMPQ BX, $16 + CMPQ AX, $17 JA _17_to_31 - MOVOU (AX), X1 - LEAQ -15(DI)(CX*1), CX + MOVOU (BP), X1 + LEAQ -15(DI)(DX*1), DX loop16: MOVOU (DI), X2 PCMPEQB X1, X2 @@ -1789,14 +1793,14 @@ loop16: CMPQ SI, $0xffff JE success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop16 JMP fail _17_to_31: - LEAQ 1(DI)(CX*1), CX - SUBQ BX, CX - MOVOU -16(AX)(BX*1), X0 - MOVOU (AX), X1 + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVOU -16(BP)(AX*1), X0 + MOVOU (BP), X1 loop17to31: MOVOU (DI), X2 PCMPEQB X1,X2 @@ -1804,21 +1808,58 @@ loop17to31: CMPQ SI, $0xffff JE partial_success17to31 ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop17to31 JMP fail partial_success17to31: - MOVOU -16(BX)(DI*1), X3 + MOVOU -16(AX)(DI*1), X3 PCMPEQB X0, X3 PMOVMSKB X3, SI CMPQ SI, $0xffff JE success ADDQ $1,DI - CMPQ DI,CX + CMPQ DI,DX JB loop17to31 fail: MOVQ $-1, ret+32(FP) RET +sse42: + MOVL runtime·cpuid_ecx(SB), CX + ANDL $0x100000, CX + JZ no_sse42 + CMPQ AX, $12 + // PCMPESTRI is slower than normal compare, + // so using it makes sense only if we advance 4+ bytes per compare + // This value was determined experimentally and is the ~same + // on Nehalem (first with SSE42) and Haswell. + JAE _9_or_more + LEAQ 16(BP), SI + TESTW $0xff0, SI + JEQ no_sse42 + MOVOU (BP), X1 + LEAQ -15(DI)(DX*1), SI + MOVQ $16, R9 + SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 +loop_sse42: + // 0x0c means: unsigned byte compare (bits 0,1 are 00) + // for equality (bits 2,3 are 11) + // result is not masked or inverted (bits 4,5 are 00) + // and corresponds to first matching byte (bit 6 is 0) + PCMPESTRI $0x0c, (DI), X1 + // CX == 16 means no match, + // CX > R9 means partial match at the end of the string, + // otherwise sep is at offset CX from X1 start + CMPQ CX, R9 + JBE sse42_success + ADDQ R9, DI + CMPQ DI, SI + JB loop_sse42 + PCMPESTRI $0x0c, -1(SI), X1 + CMPQ CX, R9 + JA fail + LEAQ -1(SI), DI +sse42_success: + ADDQ CX, DI success: SUBQ s+0(FP), DI MOVQ DI, ret+32(FP)