diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
index 2d2490b0246..55e02ce8a18 100644
--- a/src/internal/bytealg/count_ppc64x.s
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -8,89 +8,147 @@
 #include "textflag.h"
 
 TEXT ·Count(SB),NOSPLIT|NOFRAME,$0-40
-	// R3 = byte array pointer
+	// R3 = byte array pointer
 	// R4 = length
-	MOVBZ	R6, R5		// R5 = byte
-	BR	countbytebody<>(SB)
+	// R6 = byte to count
+	MTVRD	R6, V1		// move compare byte
+	MOVD	R6, R5
+	VSPLTB	$7, V1, V1	// replicate byte across V1
+	BR	countbytebody<>(SB)
 
 TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
 	// R3 = byte array pointer
 	// R4 = length
-	MOVBZ	R5, R5		// R5 = byte
-	BR	countbytebody<>(SB)
+	// R5 = byte to count
+	MTVRD	R5, V1		// move compare byte
+	VSPLTB	$7, V1, V1	// replicate byte across V1
+	BR	countbytebody<>(SB)
 
 // R3: addr of string
 // R4: len of string
 // R5: byte to count
+// V1: byte to count, splatted.
 // On exit:
 // R3: return value
-// endianness shouldn't matter since we are just counting and order
-// is irrelevant
 TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
-	DCBT	(R3)		// Prepare cache line.
-	MOVD	R0, R18		// byte count
-	MOVD	R3, R19		// Save base address for calculating the index later.
-	MOVD	R4, R16
+	MOVD	$0, R18		// byte count
 
-	MOVD	R5, R6
-	RLDIMI	$8, R6, $48, R6
-	RLDIMI	$16, R6, $32, R6
-	RLDIMI	$32, R6, $0, R6	// fill reg with the byte to count
+#ifndef GOPPC64_power10
+	RLDIMI	$8, R5, $48, R5
+	RLDIMI	$16, R5, $32, R5
+	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
+#endif
 
-	VSPLTISW $3, V4		// used for shift
-	MTVRD	R6, V1		// move compare byte
-	VSPLTB	$7, V1, V1	// replicate byte across V1
-
-	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
-	BLT	tail		// Jump to the small string case
-	XXLXOR	VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
+	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
+	BLT	tail		// Jump to the small string case
+	SRD	$5, R4, R20
+	MOVD	R20, CTR
+	MOVD	$16, R21
+	XXLXOR	V4, V4, V4
+	XXLXOR	V5, V5, V5
 
+	PCALIGN	$16
 cmploop:
-	LXVW4X	(R3), VS32	// load bytes from string
+	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
+	LXVD2X	(R21)(R3), V2
+	VCMPEQUB V2, V1, V2
+	VCMPEQUB V0, V1, V0
+	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
+	VPOPCNTD V0, V0
+	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
+	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
+	ADD	$32, R3
+	BDNZ	cmploop
 
-	// when the bytes match, the corresponding byte contains all 1s
-	VCMPEQUB V1, V0, V2	// compare bytes
-	VPOPCNTD V2, V3		// each double word contains its count
-	VADDUDM	V3, V5, V5	// accumulate bit count in each double word
-	ADD	$16, R3, R3	// increment pointer
-	SUB	$16, R16, R16	// remaining bytes
-	CMP	R16, $16	// at least 16 remaining?
-	BGE	cmploop
-	VSRD	V5, V4, V5	// shift by 3 to convert bits to bytes
-	VSLDOI	$8, V5, V5, V6	// get the double word values from vector
-	MFVSRD	V5, R9
-	MFVSRD	V6, R10
-	ADD	R9, R10, R9
-	ADD	R9, R18, R18
+	VADDUDM	V4, V5, V5
+	MFVSRD	V5, R18
+	VSLDOI	$8, V5, V5, V5
+	MFVSRD	V5, R21
+	ADD	R21, R18, R18
+	ANDCC	$31, R4, R4
+	// Skip the tail processing if no bytes remaining.
+	BEQ	tail_0
 
-tail:
-	CMP	R16, $8		// 8 bytes left?
-	BLT	small
+#ifdef GOPPC64_power10
+	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.
 
-	MOVD	(R3), R12	// load 8 bytes
-	CMPB	R12, R6, R17	// compare bytes
-	POPCNTD	R17, R15	// bit count
-	SRD	$3, R15, R15	// byte count
-	ADD	R15, R18, R18	// add to byte count
+tail:	// Count the last 0 - 31 bytes.
+	CMP	R4, $16
+	BLE	small_tail_p10
+	LXV	0(R3), V0
+	VCMPEQUB V0, V1, V0
+	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
+	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
+	ADD	R14, R18, R18
+	ADD	$16, R3, R3
+	ANDCC	$15, R4, R4
 
-next1:
-	ADD	$8, R3, R3
-	SUB	$8, R16, R16	// remaining bytes
-	BR	tail
-
-small:
-	CMP	$0, R16		// any remaining
-	BEQ	done
-	MOVBZ	(R3), R12	// check each remaining byte
-	CMP	R12, R5
-	BNE	next2
-	ADD	$1, R18
-
-next2:
-	SUB	$1, R16
-	ADD	$1, R3		// inc address
-	BR	small
-
-done:
-	MOVD	R18, R3		// return count
+small_tail_p10:
+	SLD	$56, R4, R6
+	LXVLL	R3, R6, V0
+	VCMPEQUB V0, V1, V0
+	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
+	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
+	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
+	ADD	R14, R18, R3
+	RET
+
+#else
+tail:	// Count the last 0 - 31 bytes.
+	CMP	R4, $16
+	BLT	tail_8
+	MOVD	(R3), R12
+	MOVD	8(R3), R14
+	CMPB	R12, R5, R12
+	CMPB	R14, R5, R14
+	POPCNTD	R12, R12
+	POPCNTD	R14, R14
+	ADD	R12, R18, R18
+	ADD	R14, R18, R18
+	ADD	$16, R3, R3
+	ADD	$-16, R4, R4
+
+tail_8:	// Count the remaining 0 - 15 bytes.
+	CMP	R4, $8
+	BLT	tail_4
+	MOVD	(R3), R12
+	CMPB	R12, R5, R12
+	POPCNTD	R12, R12
+	ADD	R12, R18, R18
+	ADD	$8, R3, R3
+	ADD	$-8, R4, R4
+
+tail_4:	// Count the remaining 0 - 7 bytes.
+	CMP	R4, $4
+	BLT	tail_2
+	MOVWZ	(R3), R12
+	CMPB	R12, R5, R12
+	SLD	$32, R12, R12	// Remove non-participating matches.
+	POPCNTD	R12, R12
+	ADD	R12, R18, R18
+	ADD	$4, R3, R3
+	ADD	$-4, R4, R4
+
+tail_2:	// Count the remaining 0 - 3 bytes.
+	CMP	R4, $2
+	BLT	tail_1
+	MOVHZ	(R3), R12
+	CMPB	R12, R5, R12
+	SLD	$48, R12, R12	// Remove non-participating matches.
+	POPCNTD	R12, R12
+	ADD	R12, R18, R18
+	ADD	$2, R3, R3
+	ADD	$-2, R4, R4
+
+tail_1:	// Count the remaining 0 - 1 bytes.
+	CMP	R4, $1
+	BLT	tail_0
+	MOVBZ	(R3), R12
+	CMPB	R12, R5, R12
+	ANDCC	$0x8, R12, R12
+	ADD	R12, R18, R18
+#endif
+
+tail_0:	// No remaining tail to count.
+	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
 	RET
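
Note (illustration only, not part of the patch): the rewritten countbytebody<> computes the same result as the scalar Go sketch below. VCMPEQUB turns every matching byte into 0xFF, so each VPOPCNTD contributes 8 bits per match; the accumulators therefore hold 8x the true count until the final SRD $3 (divide by 8) fixup at tail_0. The function name here is hypothetical.

	package sketch

	// countByte is an illustrative scalar equivalent of countbytebody<>:
	// it counts how many bytes of b equal c. The assembly reaches the
	// same total 16 or 32 bytes at a time using vector compares and
	// popcounts instead of a per-byte loop.
	func countByte(b []byte, c byte) int {
		n := 0
		for _, x := range b {
			if x == c {
				n++
			}
		}
		return n
	}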