mirror of
https://github.com/golang/go
synced 2024-09-25 01:20:13 -06:00
strings: use SSE4.2 in strings.Index on AMD64
Use PCMPESTRI instruction if available. Index-4 21.1ns ± 0% 21.1ns ± 0% ~ (all samples are equal) IndexHard1-4 395µs ± 0% 105µs ± 0% -73.53% (p=0.000 n=19+20) IndexHard2-4 300µs ± 0% 147µs ± 0% -51.11% (p=0.000 n=19+20) IndexHard3-4 665µs ± 0% 665µs ± 0% ~ (p=0.942 n=16+19) Change-Id: I4f66794164740a2b939eb1c78934e2390b489064 Reviewed-on: https://go-review.googlesource.com/22337 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Russ Cox <rsc@golang.org>
This commit is contained in:
parent
d78c84c419
commit
6b02a19247
@ -739,6 +739,8 @@ const (
|
||||
AUNPCKLPS
|
||||
AXORPD
|
||||
AXORPS
|
||||
APCMPESTRI
|
||||
|
||||
ARETFW
|
||||
ARETFL
|
||||
ARETFQ
|
||||
|
@ -682,6 +682,7 @@ var Anames = []string{
|
||||
"UNPCKLPS",
|
||||
"XORPD",
|
||||
"XORPS",
|
||||
"PCMPESTRI",
|
||||
"RETFW",
|
||||
"RETFL",
|
||||
"RETFQ",
|
||||
|
@ -1648,6 +1648,7 @@ var optab =
|
||||
{AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}},
|
||||
{APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}},
|
||||
{APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}},
|
||||
{APCMPESTRI, yxshuf, Pq, [23]uint8{0x3a, 0x61, 0}},
|
||||
|
||||
{AANDNL, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W0, 0xF2}},
|
||||
{AANDNQ, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W1, 0xF2}},
|
||||
|
@ -1666,122 +1666,126 @@ big_loop_avx2_exit:
|
||||
// TODO: Also use this in bytes.Index
|
||||
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
|
||||
MOVQ s+0(FP), DI
|
||||
MOVQ s_len+8(FP), CX
|
||||
MOVQ c+16(FP), AX
|
||||
MOVQ c_len+24(FP), BX
|
||||
CMPQ BX, CX
|
||||
// We want len in DX and AX, because PCMPESTRI implicitly consumes them
|
||||
MOVQ s_len+8(FP), DX
|
||||
MOVQ c+16(FP), BP
|
||||
MOVQ c_len+24(FP), AX
|
||||
CMPQ AX, DX
|
||||
JA fail
|
||||
CMPQ BX, $2
|
||||
CMPQ DX, $16
|
||||
JAE sse42
|
||||
no_sse42:
|
||||
CMPQ AX, $2
|
||||
JA _3_or_more
|
||||
MOVW (AX), AX
|
||||
LEAQ -1(DI)(CX*1), CX
|
||||
MOVW (BP), BP
|
||||
LEAQ -1(DI)(DX*1), DX
|
||||
loop2:
|
||||
MOVW (DI), SI
|
||||
CMPW SI,AX
|
||||
CMPW SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop2
|
||||
JMP fail
|
||||
_3_or_more:
|
||||
CMPQ BX, $3
|
||||
CMPQ AX, $3
|
||||
JA _4_or_more
|
||||
MOVW 1(AX), DX
|
||||
MOVW (AX), AX
|
||||
LEAQ -2(DI)(CX*1), CX
|
||||
MOVW 1(BP), BX
|
||||
MOVW (BP), BP
|
||||
LEAQ -2(DI)(DX*1), DX
|
||||
loop3:
|
||||
MOVW (DI), SI
|
||||
CMPW SI,AX
|
||||
CMPW SI,BP
|
||||
JZ partial_success3
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop3
|
||||
JMP fail
|
||||
partial_success3:
|
||||
MOVW 1(DI), SI
|
||||
CMPW SI,DX
|
||||
CMPW SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop3
|
||||
JMP fail
|
||||
_4_or_more:
|
||||
CMPQ BX, $4
|
||||
CMPQ AX, $4
|
||||
JA _5_or_more
|
||||
MOVL (AX), AX
|
||||
LEAQ -3(DI)(CX*1), CX
|
||||
MOVL (BP), BP
|
||||
LEAQ -3(DI)(DX*1), DX
|
||||
loop4:
|
||||
MOVL (DI), SI
|
||||
CMPL SI,AX
|
||||
CMPL SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop4
|
||||
JMP fail
|
||||
_5_or_more:
|
||||
CMPQ BX, $7
|
||||
CMPQ AX, $7
|
||||
JA _8_or_more
|
||||
LEAQ 1(DI)(CX*1), CX
|
||||
SUBQ BX, CX
|
||||
MOVL -4(AX)(BX*1), DX
|
||||
MOVL (AX), AX
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVL -4(BP)(AX*1), BX
|
||||
MOVL (BP), BP
|
||||
loop5to7:
|
||||
MOVL (DI), SI
|
||||
CMPL SI,AX
|
||||
CMPL SI,BP
|
||||
JZ partial_success5to7
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop5to7
|
||||
JMP fail
|
||||
partial_success5to7:
|
||||
MOVL -4(BX)(DI*1), SI
|
||||
CMPL SI,DX
|
||||
MOVL -4(AX)(DI*1), SI
|
||||
CMPL SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop5to7
|
||||
JMP fail
|
||||
_8_or_more:
|
||||
CMPQ BX, $8
|
||||
CMPQ AX, $8
|
||||
JA _9_or_more
|
||||
MOVQ (AX), AX
|
||||
LEAQ -7(DI)(CX*1), CX
|
||||
MOVQ (BP), BP
|
||||
LEAQ -7(DI)(DX*1), DX
|
||||
loop8:
|
||||
MOVQ (DI), SI
|
||||
CMPQ SI,AX
|
||||
CMPQ SI,BP
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop8
|
||||
JMP fail
|
||||
_9_or_more:
|
||||
CMPQ BX, $16
|
||||
CMPQ AX, $16
|
||||
JA _16_or_more
|
||||
LEAQ 1(DI)(CX*1), CX
|
||||
SUBQ BX, CX
|
||||
MOVQ -8(AX)(BX*1), DX
|
||||
MOVQ (AX), AX
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVQ -8(BP)(AX*1), BX
|
||||
MOVQ (BP), BP
|
||||
loop9to15:
|
||||
MOVQ (DI), SI
|
||||
CMPQ SI,AX
|
||||
CMPQ SI,BP
|
||||
JZ partial_success9to15
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop9to15
|
||||
JMP fail
|
||||
partial_success9to15:
|
||||
MOVQ -8(BX)(DI*1), SI
|
||||
CMPQ SI,DX
|
||||
MOVQ -8(AX)(DI*1), SI
|
||||
CMPQ SI,BX
|
||||
JZ success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop9to15
|
||||
JMP fail
|
||||
_16_or_more:
|
||||
CMPQ BX, $16
|
||||
CMPQ AX, $17
|
||||
JA _17_to_31
|
||||
MOVOU (AX), X1
|
||||
LEAQ -15(DI)(CX*1), CX
|
||||
MOVOU (BP), X1
|
||||
LEAQ -15(DI)(DX*1), DX
|
||||
loop16:
|
||||
MOVOU (DI), X2
|
||||
PCMPEQB X1, X2
|
||||
@ -1789,14 +1793,14 @@ loop16:
|
||||
CMPQ SI, $0xffff
|
||||
JE success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop16
|
||||
JMP fail
|
||||
_17_to_31:
|
||||
LEAQ 1(DI)(CX*1), CX
|
||||
SUBQ BX, CX
|
||||
MOVOU -16(AX)(BX*1), X0
|
||||
MOVOU (AX), X1
|
||||
LEAQ 1(DI)(DX*1), DX
|
||||
SUBQ AX, DX
|
||||
MOVOU -16(BP)(AX*1), X0
|
||||
MOVOU (BP), X1
|
||||
loop17to31:
|
||||
MOVOU (DI), X2
|
||||
PCMPEQB X1,X2
|
||||
@ -1804,21 +1808,58 @@ loop17to31:
|
||||
CMPQ SI, $0xffff
|
||||
JE partial_success17to31
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop17to31
|
||||
JMP fail
|
||||
partial_success17to31:
|
||||
MOVOU -16(BX)(DI*1), X3
|
||||
MOVOU -16(AX)(DI*1), X3
|
||||
PCMPEQB X0, X3
|
||||
PMOVMSKB X3, SI
|
||||
CMPQ SI, $0xffff
|
||||
JE success
|
||||
ADDQ $1,DI
|
||||
CMPQ DI,CX
|
||||
CMPQ DI,DX
|
||||
JB loop17to31
|
||||
fail:
|
||||
MOVQ $-1, ret+32(FP)
|
||||
RET
|
||||
sse42:
|
||||
MOVL runtime·cpuid_ecx(SB), CX
|
||||
ANDL $0x100000, CX
|
||||
JZ no_sse42
|
||||
CMPQ AX, $12
|
||||
// PCMPESTRI is slower than normal compare,
|
||||
// so using it makes sense only if we advance 4+ bytes per compare
|
||||
// This value was determined experimentally and is the ~same
|
||||
// on Nehalem (first with SSE42) and Haswell.
|
||||
JAE _9_or_more
|
||||
LEAQ 16(BP), SI
|
||||
TESTW $0xff0, SI
|
||||
JEQ no_sse42
|
||||
MOVOU (BP), X1
|
||||
LEAQ -15(DI)(DX*1), SI
|
||||
MOVQ $16, R9
|
||||
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
|
||||
loop_sse42:
|
||||
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
|
||||
// for equality (bits 2,3 are 11)
|
||||
// result is not masked or inverted (bits 4,5 are 00)
|
||||
// and corresponds to first matching byte (bit 6 is 0)
|
||||
PCMPESTRI $0x0c, (DI), X1
|
||||
// CX == 16 means no match,
|
||||
// CX > R9 means partial match at the end of the string,
|
||||
// otherwise sep is at offset CX from X1 start
|
||||
CMPQ CX, R9
|
||||
JBE sse42_success
|
||||
ADDQ R9, DI
|
||||
CMPQ DI, SI
|
||||
JB loop_sse42
|
||||
PCMPESTRI $0x0c, -1(SI), X1
|
||||
CMPQ CX, R9
|
||||
JA fail
|
||||
LEAQ -1(SI), DI
|
||||
sse42_success:
|
||||
ADDQ CX, DI
|
||||
success:
|
||||
SUBQ s+0(FP), DI
|
||||
MOVQ DI, ret+32(FP)
|
||||
|
Loading…
Reference in New Issue
Block a user