From d60166d5eea5084e0957e9028237cc87ecadbf7d Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Seo Date: Mon, 27 Feb 2017 21:32:29 -0300 Subject: [PATCH] runtime: improve IndexByte for ppc64x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a better implementation of IndexByte for ppc64x. Improvement for bytes·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte/10-16 12.5 8.48 -32.16% BenchmarkIndexByte/32-16 34.4 9.85 -71.37% BenchmarkIndexByte/4K-16 3089 217 -92.98% BenchmarkIndexByte/4M-16 3154810 207051 -93.44% BenchmarkIndexByte/64M-16 50564811 5579093 -88.97% benchmark old MB/s new MB/s speedup BenchmarkIndexByte/10-16 800.41 1179.64 1.47x BenchmarkIndexByte/32-16 930.60 3249.10 3.49x BenchmarkIndexByte/4K-16 1325.71 18832.53 14.21x BenchmarkIndexByte/4M-16 1329.49 20257.29 15.24x BenchmarkIndexByte/64M-16 1327.19 12028.63 9.06x Improvement for strings·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte-16 25.9 7.69 -70.31% Fixes #19030 Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11 Reviewed-on: https://go-review.googlesource.com/37670 Run-TryBot: Lynn Boger TryBot-Result: Gobot Gobot Reviewed-by: Lynn Boger --- src/runtime/asm_ppc64x.s | 202 ++++++++++++++++++++++++++++++++------- 1 file changed, 166 insertions(+), 36 deletions(-) diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index 4ab5dec5cd..caa000bb56 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -1113,53 +1113,183 @@ equal: MOVBZ R3,ret+48(FP) RET -TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 - MOVD s+0(FP), R3 - MOVD s_len+8(FP), R4 - MOVBZ c+24(FP), R5 // byte to find - MOVD R3, R6 // store base for later - SUB $1, R3 - ADD R3, R4 // end-1 +TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40 + MOVD s+0(FP), R3 // R3 = byte array pointer + MOVD s_len+8(FP), R4 // R4 = length + MOVBZ c+24(FP), R5 // R5 = byte to search for + MOVD $ret+32(FP), R14 // R14 = &ret; indexbytebody stores the result through R14 + BR runtime·indexbytebody<>(SB) // Branch to the search body shared with strings·IndexByte 
+TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32 + MOVD s+0(FP), R3 // R3 = pointer to the string data + MOVD s_len+8(FP), R4 // R4 = length + MOVBZ c+16(FP), R5 // R5 = byte to search for + MOVD $ret+24(FP), R14 // R14 = &ret; indexbytebody stores the result through R14 + BR runtime·indexbytebody<>(SB) + +TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 + DCBT (R3) // Prepare cache line. + MOVD R3,R10 // Save base address for calculating the index later. + RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8. + RLDIMI $8,R5,$48,R5 // Start replicating the byte across the register (completed by the two RLDIMIs below). + + // Calculate last acceptable address and check for possible overflow + // using a saturated add. + // Overflows set last acceptable address to 0xffffffffffffffff. + ADD R4,R3,R7 + SUBC R3,R7,R6 + SUBE R0,R0,R9 + MOVW R9,R6 + OR R6,R7,R7 + + RLDIMI $16,R5,$32,R5 + CMPU R4,$32 // Check if it's a small string (<=32 bytes). Those will be processed differently. + MOVD $-1,R9 + WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28). + RLDIMI $32,R5,$0,R5 + ADD $-1,R7,R7 // R7 = address of the last byte to examine. +#ifdef GOARCH_ppc64le + SLD R6,R9,R9 // Prepare mask for Little Endian +#else + SRD R6,R9,R9 // Same for Big Endian +#endif + BLE small_string // Jump to the small string case if it's <=32 bytes (uses the CMPU R4,$32 result above). + + // Case for length >32 bytes + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. + CMPB R12,R5,R3 // Check for a match. + AND R9,R3,R3 // Mask bytes below s_base + RLDICL $0,R7,$61,R4 // R4 = low 3 bits of R7: offset of the last byte within its doubleword (compared at done). + RLDICR $0,R7,$60,R7 // R7 = address of the doubleword that holds the last byte. + CMPU R3,$0,CR7 // If we have a match, jump to the final computation + BNE CR7,done + + // R8 is already doubleword-aligned; skip to the loop setup when 8(R8) starts a 16-byte pair. NOTE(review): BC 12,28 appears to test the 8s bit of R8 copied into CR7 - confirm against the Power ISA. + MOVFL R8,CR7 + BC 12,28,loop_setup + + // 8(R8) is not 16-byte aligned, so check that doubleword on its own first. + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR7 + BNE CR7,done + +loop_setup: + // We are now aligned to a 16-byte boundary. We will load two doublewords + // per loop iteration. The last doubleword is in R7, so our loop counter + // starts at (R7-R8)/16. 
+ SUB R8,R7,R6 + SRD $4,R6,R6 + MOVD R6,CTR + + // Note: when we have an align directive, align this loop to 32 bytes so + // it fits in a single icache sector. loop: - CMP R3, R4 - BEQ notfound - MOVBZU 1(R3), R7 - CMP R7, R5 - BNE loop + // Load two doublewords, then compare and merge in a single register. We + // will check two doublewords per iteration, then find out which of them + // contains the byte later. This speeds up the search. + MOVD 8(R8),R12 + MOVDU 16(R8),R11 + CMPB R12,R5,R3 + CMPB R11,R5,R9 + OR R3,R9,R6 + CMPU R6,$0,CR7 + BNE CR7,found + BC 16,0,loop // Decrement CTR and branch back to loop while it is non-zero. - SUB R6, R3 // remove base - MOVD R3, ret+32(FP) - RET + // Counter zeroed, but we may have another doubleword to read + CMPU R8,R7 + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + BNE CR6,done notfound: - MOVD $-1, R3 - MOVD R3, ret+32(FP) + MOVD $-1,R3 + MOVD R3,(R14) RET -TEXT strings·IndexByte(SB),NOSPLIT,$0-32 - MOVD p+0(FP), R3 - MOVD b_len+8(FP), R4 - MOVBZ c+16(FP), R5 // byte to find - MOVD R3, R6 // store base for later - SUB $1, R3 - ADD R3, R4 // end-1 +found: + // One of the doublewords from the loop contains the byte we are looking + // for. Check the first doubleword and adjust the address if found. + CMPU R3,$0,CR6 + ADD $-8,R8,R8 + BNE CR6,done + + // Not found, so it must be in the second doubleword of the merged pair. + MOVD R9,R3 + ADD $8,R8,R8 + +done: + // At this point, R3 has 0xFF in the same position as the byte we are + // looking for in the doubleword. Use that to calculate the exact index + // of the byte. +#ifdef GOARCH_ppc64le + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 // Count trailing zeros (Little Endian). +#else + CNTLZD R3,R11 // Count leading zeros (Big Endian). +#endif + CMPU R8,R7 // Check if we are at the last doubleword. + SRD $3,R11 // Convert the zero-bit count (trailing on LE, leading on BE) to a byte offset. + ADD R11,R8,R3 // R3 = address of the matching byte. + CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset. 
+ BNE return // Not the last doubleword, so the match is within bounds. + BLE CR7,return // Last doubleword: in bounds only if the match offset <= offset of the last byte (R4). + MOVD $-1,R3 // The match lies past the end of the data: report not found. + MOVD R3,(R14) + RET + +return: + SUB R10,R3 // Index = match address - base address saved in R10. + MOVD R3,(R14) + RET + +small_string: + // We unroll this loop for better performance. + CMPU R4,$0 // Check for length=0 BEQ notfound - MOVBZU 1(R3), R7 - CMP R7, R5 - BNE loop - SUB R6, R3 // remove base - MOVD R3, ret+24(FP) - RET + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. + CMPB R12,R5,R3 // Check for a match. + AND R9,R3,R3 // Mask bytes below s_base. + CMPU R3,$0,CR7 // If we have a match, jump to the final computation. + RLDICL $0,R7,$61,R4 // R4 = low 3 bits of R7: offset of the last byte within its doubleword (compared at done). + RLDICR $0,R7,$60,R7 // Last doubleword in R7. + CMPU R8,R7 + BNE CR7,done + BEQ notfound // Hit length. -notfound: - MOVD $-1, R3 - MOVD R3, ret+24(FP) - RET + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BR notfound TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 MOVD s1_base+0(FP), R5