diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
index 807c289113..3a8dc3675a 100644
--- a/src/internal/bytealg/count_amd64.s
+++ b/src/internal/bytealg/count_amd64.s
@@ -51,8 +51,8 @@ TEXT countbody<>(SB),NOSPLIT,$0
 
 	MOVQ SI, DI
 
-	CMPQ BX, $32
-	JA avx2
+	CMPQ BX, $64
+	JAE avx2
 sse:
 	LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
 	JMP sseloopentry
@@ -162,44 +162,63 @@ avx2:
 	JNE sse
 #endif
 	MOVD AX, X0
-	LEAQ -32(SI)(BX*1), R11
+	LEAQ -64(SI)(BX*1), R11
+	LEAQ (SI)(BX*1), R13
 	VPBROADCASTB X0, Y1
 	PCALIGN $32
 avx2_loop:
 	VMOVDQU (DI), Y2
+	VMOVDQU 32(DI), Y4
 	VPCMPEQB Y1, Y2, Y3
+	VPCMPEQB Y1, Y4, Y5
 	VPMOVMSKB Y3, DX
+	VPMOVMSKB Y5, CX
 	POPCNTL DX, DX
+	POPCNTL CX, CX
 	ADDQ DX, R12
-	ADDQ $32, DI
+	ADDQ CX, R12
+	ADDQ $64, DI
 	CMPQ DI, R11
 	JLE avx2_loop
 
 	// If last block is already processed,
 	// skip to the end.
-	CMPQ DI, R11
+	//
+	// This check is NOT an optimization; if the input length is a
+	// multiple of 64, we must not go through the last leg of the
+	// function because the bit shift count passed to SALQ below would
+	// be 64, which is outside of the 0-63 range supported by those
+	// instructions.
+	//
+	// Tests in the bytes and strings packages with input lengths that
+	// are multiples of 64 will break if this condition were removed.
+	CMPQ DI, R13
 	JEQ endavx
 
-	// Load address of the last 32 bytes.
+	// Load address of the last 64 bytes.
 	// There is an overlap with the previous block.
 	MOVQ R11, DI
 	VMOVDQU (DI), Y2
+	VMOVDQU 32(DI), Y4
 	VPCMPEQB Y1, Y2, Y3
+	VPCMPEQB Y1, Y4, Y5
 	VPMOVMSKB Y3, DX
+	VPMOVMSKB Y5, CX
 	// Exit AVX mode.
 	VZEROUPPER
+	SALQ $32, CX
+	ORQ CX, DX
 
-	// Create mask to ignore overlap between previous 32 byte block
+	// Create mask to ignore overlap between previous 64 byte block
 	// and the next.
-	ANDQ $31, BX
-	MOVQ $32,CX
+	ANDQ $63, BX
+	MOVQ $64, CX
 	SUBQ BX, CX
-	MOVQ $0xFFFFFFFF, R10
-	SARQ CL, R10
+	MOVQ $0xFFFFFFFFFFFFFFFF, R10
 	SALQ CL, R10
 	// Apply mask
 	ANDQ R10, DX
-	POPCNTL DX, DX
+	POPCNTQ DX, DX
 	ADDQ DX, R12
 	MOVQ R12, (R8)
 	RET