mirror of
https://github.com/golang/go
synced 2024-11-18 02:44:48 -07:00
internal/bytealg: add SIMD byte count implementation for s390x
Add a 'single lane' SIMD implemementation of the single byte count function for use on machines that support the vector facility. This allows up to 16 bytes to be counted per loop iteration. We can probably improve performance further by adding more 'lanes' (i.e. counting more bytes in parallel) however this will increase the complexity of the function so I'm not sure it is worth doing yet. name old speed new speed delta pkg:strings goos:linux goarch:s390x CountByte/10 789MB/s ± 0% 1131MB/s ± 0% +43.44% (p=0.000 n=9+9) CountByte/32 936MB/s ± 0% 3236MB/s ± 0% +245.87% (p=0.000 n=8+9) CountByte/4096 1.06GB/s ± 0% 21.26GB/s ± 0% +1907.07% (p=0.000 n=10+10) CountByte/4194304 1.06GB/s ± 0% 20.54GB/s ± 0% +1838.50% (p=0.000 n=10+10) CountByte/67108864 1.06GB/s ± 0% 18.31GB/s ± 0% +1629.51% (p=0.000 n=10+10) pkg:bytes goos:linux goarch:s390x CountSingle/10 800MB/s ± 0% 986MB/s ± 0% +23.21% (p=0.000 n=9+10) CountSingle/32 925MB/s ± 0% 2744MB/s ± 0% +196.55% (p=0.000 n=9+10) CountSingle/4K 1.26GB/s ± 0% 19.44GB/s ± 0% +1445.59% (p=0.000 n=10+10) CountSingle/4M 1.26GB/s ± 0% 20.28GB/s ± 0% +1510.26% (p=0.000 n=8+10) CountSingle/64M 1.23GB/s ± 0% 17.78GB/s ± 0% +1350.67% (p=0.000 n=9+10) Change-Id: I230d57905db92a8fdfc50b1d5be338941ae3a7a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/199979 Run-TryBot: Michael Munday <mike.munday@ibm.com> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
d9ee4b2859
commit
b6245cef3c
@ -2,7 +2,7 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
// +build !amd64,!arm,!arm64,!ppc64le,!ppc64
|
// +build !amd64,!arm,!arm64,!ppc64le,!ppc64,!s390x
|
||||||
|
|
||||||
package bytealg
|
package bytealg
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
// +build amd64 arm arm64 ppc64le ppc64
|
// +build amd64 arm arm64 ppc64le ppc64 s390x
|
||||||
|
|
||||||
package bytealg
|
package bytealg
|
||||||
|
|
||||||
|
169
src/internal/bytealg/count_s390x.s
Normal file
169
src/internal/bytealg/count_s390x.s
Normal file
@ -0,0 +1,169 @@
|
|||||||
|
// Copyright 2019 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#include "go_asm.h"
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
// condition code masks
|
||||||
|
#define EQ 8
|
||||||
|
#define NE 7
|
||||||
|
|
||||||
|
// register assignments
|
||||||
|
#define R_ZERO R0
|
||||||
|
#define R_VAL R1
|
||||||
|
#define R_TMP R2
|
||||||
|
#define R_PTR R3
|
||||||
|
#define R_LEN R4
|
||||||
|
#define R_CHAR R5
|
||||||
|
#define R_RET R6
|
||||||
|
#define R_ITER R7
|
||||||
|
#define R_CNT R8
|
||||||
|
#define R_MPTR R9
|
||||||
|
|
||||||
|
// vector register assignments
|
||||||
|
#define V_ZERO V0
|
||||||
|
#define V_CHAR V1
|
||||||
|
#define V_MASK V2
|
||||||
|
#define V_VAL V3
|
||||||
|
#define V_CNT V4
|
||||||
|
|
||||||
|
// mask for trailing bytes in vector implementation
|
||||||
|
GLOBL countbytemask<>(SB), RODATA, $16
|
||||||
|
DATA countbytemask<>+0(SB)/8, $0x0101010101010101
|
||||||
|
DATA countbytemask<>+8(SB)/8, $0x0101010101010101
|
||||||
|
|
||||||
|
// func Count(b []byte, c byte) int
|
||||||
|
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
|
||||||
|
LMG b+0(FP), R_PTR, R_LEN
|
||||||
|
MOVBZ c+24(FP), R_CHAR
|
||||||
|
MOVD $ret+32(FP), R_RET
|
||||||
|
BR countbytebody<>(SB)
|
||||||
|
|
||||||
|
// func CountString(s string, c byte) int
|
||||||
|
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
|
||||||
|
LMG s+0(FP), R_PTR, R_LEN
|
||||||
|
MOVBZ c+16(FP), R_CHAR
|
||||||
|
MOVD $ret+24(FP), R_RET
|
||||||
|
BR countbytebody<>(SB)
|
||||||
|
|
||||||
|
// input:
|
||||||
|
// R_PTR = address of array of bytes
|
||||||
|
// R_LEN = number of bytes in array
|
||||||
|
// R_CHAR = byte value to count zero (extended to register width)
|
||||||
|
// R_RET = address of return value
|
||||||
|
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
|
||||||
|
MOVD $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
|
||||||
|
MOVD $countbytemask<>(SB), R_MPTR
|
||||||
|
CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0.
|
||||||
|
SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks
|
||||||
|
MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility
|
||||||
|
CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
|
||||||
|
|
||||||
|
// Start of vector code (have vector facility).
|
||||||
|
//
|
||||||
|
// Set R_LEN to be the length mod 16 minus 1 to use as an index for
|
||||||
|
// vector 'load with length' (VLL). It will be in the range [-1,14].
|
||||||
|
// Also replicate c across a 16-byte vector and initialize V_ZERO.
|
||||||
|
ANDW $0xf, R_LEN
|
||||||
|
VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
|
||||||
|
VZERO V_ZERO // V_ZERO = [1]uint128{0}
|
||||||
|
ADDW $-1, R_LEN
|
||||||
|
VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
|
||||||
|
|
||||||
|
// Jump to loop if we have more than 15 bytes to process.
|
||||||
|
CGIJ $NE, R_ITER, $0, vxchunks
|
||||||
|
|
||||||
|
// Load 1-15 bytes and corresponding mask.
|
||||||
|
// Note: only the low 32-bits of R_LEN are used for the index.
|
||||||
|
VLL R_LEN, (R_PTR), V_VAL
|
||||||
|
VLL R_LEN, (R_MPTR), V_MASK
|
||||||
|
|
||||||
|
// Compare each byte in input chunk against byte to be counted.
|
||||||
|
// Each byte element will be set to either 0 (no match) or 1 (match).
|
||||||
|
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
|
||||||
|
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
|
||||||
|
|
||||||
|
// Accumulate matched byte count in 128-bit integer value.
|
||||||
|
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
|
||||||
|
VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
|
||||||
|
|
||||||
|
// Return rightmost (lowest) 64-bit part of accumulator.
|
||||||
|
VSTEG $1, V_CNT, (R_RET)
|
||||||
|
RET
|
||||||
|
|
||||||
|
vxchunks:
|
||||||
|
// Load 0x01 into every byte element in the 16-byte mask vector.
|
||||||
|
VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
|
||||||
|
VZERO V_CNT // intial uint128 count of 0
|
||||||
|
|
||||||
|
vxloop:
|
||||||
|
// Load input bytes in 16-byte chunks.
|
||||||
|
VL (R_PTR), V_VAL
|
||||||
|
|
||||||
|
// Compare each byte in input chunk against byte to be counted.
|
||||||
|
// Each byte element will be set to either 0 (no match) or 1 (match).
|
||||||
|
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
|
||||||
|
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
|
||||||
|
|
||||||
|
// Increment input string address.
|
||||||
|
MOVD $16(R_PTR), R_PTR
|
||||||
|
|
||||||
|
// Accumulate matched byte count in 128-bit integer value.
|
||||||
|
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
|
||||||
|
VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
|
||||||
|
VAQ V_VAL, V_CNT, V_CNT // accumulate
|
||||||
|
|
||||||
|
// Repeat until all 16-byte chunks are done.
|
||||||
|
BRCTG R_ITER, vxloop
|
||||||
|
|
||||||
|
// Skip to end if there are no trailing bytes.
|
||||||
|
CIJ $EQ, R_LEN, $-1, vxret
|
||||||
|
|
||||||
|
// Load 1-15 bytes and corresponding mask.
|
||||||
|
// Note: only the low 32-bits of R_LEN are used for the index.
|
||||||
|
VLL R_LEN, (R_PTR), V_VAL
|
||||||
|
VLL R_LEN, (R_MPTR), V_MASK
|
||||||
|
|
||||||
|
// Compare each byte in input chunk against byte to be counted.
|
||||||
|
// Each byte element will be set to either 0 (no match) or 1 (match).
|
||||||
|
VCEQB V_CHAR, V_VAL, V_VAL
|
||||||
|
VN V_MASK, V_VAL, V_VAL
|
||||||
|
|
||||||
|
// Accumulate matched byte count in 128-bit integer value.
|
||||||
|
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
|
||||||
|
VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
|
||||||
|
VAQ V_VAL, V_CNT, V_CNT // accumulate
|
||||||
|
|
||||||
|
vxret:
|
||||||
|
// Return rightmost (lowest) 64-bit part of accumulator.
|
||||||
|
VSTEG $1, V_CNT, (R_RET)
|
||||||
|
RET
|
||||||
|
|
||||||
|
novx:
|
||||||
|
// Start of non-vector code (the vector facility not available).
|
||||||
|
//
|
||||||
|
// Initialise counter and constant zero.
|
||||||
|
MOVD $0, R_CNT
|
||||||
|
MOVD $0, R_ZERO
|
||||||
|
|
||||||
|
loop:
|
||||||
|
// Read 1-byte from input and compare.
|
||||||
|
// Note: avoid putting LOCGR in critical path.
|
||||||
|
MOVBZ (R_PTR), R_VAL
|
||||||
|
MOVD $1, R_TMP
|
||||||
|
MOVD $1(R_PTR), R_PTR
|
||||||
|
CMPW R_VAL, R_CHAR
|
||||||
|
LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
|
||||||
|
ADD R_TMP, R_CNT // accumulate 64-bit result
|
||||||
|
|
||||||
|
// Repeat until all bytes have been checked.
|
||||||
|
BRCTG R_LEN, loop
|
||||||
|
|
||||||
|
ret:
|
||||||
|
MOVD R_CNT, (R_RET)
|
||||||
|
RET
|
||||||
|
|
||||||
|
ret0:
|
||||||
|
MOVD $0, (R_RET)
|
||||||
|
RET
|
Loading…
Reference in New Issue
Block a user