diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 5e202e7a87..9bf0646c8d 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -802,48 +802,123 @@ samebytes:
 //
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
 	MOVD	b+0(FP), R0
-	MOVD	b_len+8(FP), R1
-	MOVBU	c+24(FP), R2	// byte to find
-	MOVD	R0, R4		// store base for later
-	ADD	R0, R1		// end
-loop:
-	CMP	R0, R1
-	BEQ	notfound
-	MOVBU.P	1(R0), R3
-	CMP	R2, R3
-	BNE	loop
-
-	SUB	$1, R0		// R0 will be one beyond the position we want
-	SUB	R4, R0		// remove base
-	MOVD	R0, ret+32(FP)
-	RET
-
-notfound:
-	MOVD	$-1, R0
-	MOVD	R0, ret+32(FP)
-	RET
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	runtime·indexbytebody<>(SB)
 
 TEXT strings·IndexByte(SB),NOSPLIT,$0-32
 	MOVD	s+0(FP), R0
-	MOVD	s_len+8(FP), R1
-	MOVBU	c+16(FP), R2	// byte to find
-	MOVD	R0, R4		// store base for later
-	ADD	R0, R1		// end
-loop:
-	CMP	R0, R1
-	BEQ	notfound
-	MOVBU.P	1(R0), R3
-	CMP	R2, R3
-	BNE	loop
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	runtime·indexbytebody<>(SB)
 
-	SUB	$1, R0		// R0 will be one beyond the position we want
-	SUB	R4, R0		// remove base
-	MOVD	R0, ret+24(FP)
+// input:
+//   R0: data
+//   R1: byte to search
+//   R2: data len
+//   R8: address to put result
+TEXT runtime·indexbytebody<>(SB),NOSPLIT,$0
+	// Core algorithm:
+	// For each 32-byte chunk we calculate a 64-bit syndrome value,
+	// with two bits per byte. For each tuple, bit 0 is set if the
+	// relevant byte matched the requested character and bit 1 is
+	// not used (faster than using a 32bit syndrome). Since the bits
+	// in the syndrome reflect exactly the order in which things occur
+	// in the original string, counting trailing zeros allows to
+	// identify exactly which byte has matched.
+
+	CBZ	R2, fail
+	MOVD	R0, R11
+	// Magic constant 0x40100401 allows us to identify
+	// which lane matches the requested byte.
+	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
+	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
+	MOVD	$0x40100401, R5
+	VMOV	R1, V0.B16
+	// Work with aligned 32-byte chunks
+	BIC	$0x1f, R0, R3
+	VMOV	R5, V5.S4
+	ANDS	$0x1f, R0, R9
+	AND	$0x1f, R2, R10
+	BEQ	loop
+
+	// Input string is not 32-byte aligned. We calculate the
+	// syndrome value for the aligned 32 bytes block containing
+	// the first bytes and mask off the irrelevant part.
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUB	$0x20, R9, R4
+	ADDS	R4, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16	// 256->128
+	VADDP	V6.B16, V6.B16, V6.B16	// 128->64
+	VMOV	V6.D[0], R6
+	// Clear the irrelevant lower bits
+	LSL	$1, R9, R4
+	LSR	R4, R6, R6
+	LSL	R4, R6, R6
+	// The first block can also be the last
+	BLS	masklast
+	// Have we found something already?
+	CBNZ	R6, tail
+
+loop:
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUBS	$0x20, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// If we're out of data we finish regardless of the result
+	BLS	end
+	// Use a fast check for the termination condition
+	VORR	V4.B16, V3.B16, V6.B16
+	VADDP	V6.D2, V6.D2, V6.D2
+	VMOV	V6.D[0], R6
+	// We're not out of data, loop if we haven't found the character
+	CBZ	R6, loop
+
+end:
+	// Termination condition found, let's calculate the syndrome value
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16
+	VADDP	V6.B16, V6.B16, V6.B16
+	VMOV	V6.D[0], R6
+	// Only do the clear for the last possible block with less than 32 bytes
+	// Condition flags come from SUBS in the loop
+	BHS	tail
+
+masklast:
+	// Clear the irrelevant upper bits
+	ADD	R9, R10, R4
+	AND	$0x1f, R4, R4
+	SUB	$0x20, R4, R4
+	NEG	R4<<1, R4
+	LSL	R4, R6, R6
+	LSR	R4, R6, R6
+
+tail:
+	// Check that we have found a character
+	CBZ	R6, fail
+	// Count the trailing zeros using bit reversing
+	RBIT	R6, R6
+	// Compensate the last post-increment
+	SUB	$0x20, R3, R3
+	// And count the leading zeros
+	CLZ	R6, R6
+	// R6 is twice the offset into the fragment
+	ADD	R6>>1, R3, R0
+	// Compute the offset result
+	SUB	R11, R0, R0
+	MOVD	R0, (R8)
 	RET
 
-notfound:
+fail:
 	MOVD	$-1, R0
-	MOVD	R0, ret+24(FP)
+	MOVD	R0, (R8)
 	RET
 
 // Equal(a, b []byte) bool
diff --git a/src/runtime/memclr_arm64.s b/src/runtime/memclr_arm64.s
index 9d756bcf6d..bf954e047f 100644
--- a/src/runtime/memclr_arm64.s
+++ b/src/runtime/memclr_arm64.s
@@ -6,32 +6,54 @@
 
 // void runtime·memclrNoHeapPointers(void*, uintptr)
 TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
-	MOVD	ptr+0(FP), R3
-	MOVD	n+8(FP), R4
-	// TODO(mwhudson): this is written this way to avoid tickling
-	// warnings from addpool when written as AND $7, R4, R6 (see
-	// https://golang.org/issue/12708)
-	AND	$~7, R4, R5	// R5 is N&~7
-	SUB	R5, R4, R6	// R6 is N&7
+	MOVD	ptr+0(FP), R0
+	MOVD	n+8(FP), R1
+	// If size is less than 16 bytes, use tail_zero to zero what remains
+	CMP	$16, R1
+	BLT	tail_zero
+	// Get buffer offset into 16 byte aligned address for better performance
+	ANDS	$15, R0, ZR
+	BNE	unaligned_to_16
+aligned_to_16:
+	LSR	$4, R1, R2
+zero_by_16:
+	STP.P	(ZR, ZR), 16(R0)
+	SUBS	$1, R2, R2
+	BNE	zero_by_16
 
-	CMP	$0, R5
-	BEQ	nowords
+	ANDS	$15, R1, R1
+	BEQ	ending
 
-	ADD	R3, R5, R5
+	// Zero buffer with size=R1 < 16
+tail_zero:
+	TBZ	$3, R1, tail_zero_4
+	MOVD.P	ZR, 8(R0)
 
-wordloop: // TODO: Optimize for unaligned ptr.
-	MOVD.P	$0, 8(R3)
-	CMP	R3, R5
-	BNE	wordloop
-nowords:
-	CMP	$0, R6
-	BEQ	done
+tail_zero_4:
+	TBZ	$2, R1, tail_zero_2
+	MOVW.P	ZR, 4(R0)
 
-	ADD	R3, R6, R6
+tail_zero_2:
+	TBZ	$1, R1, tail_zero_1
+	MOVH.P	ZR, 2(R0)
 
-byteloop:
-	MOVBU.P	$0, 1(R3)
-	CMP	R3, R6
-	BNE	byteloop
-done:
+tail_zero_1:
+	TBZ	$0, R1, ending
+	MOVB	ZR, (R0)
+
+ending:
 	RET
+
+unaligned_to_16:
+	MOVD	R0, R2
+head_loop:
+	MOVBU.P	ZR, 1(R0)
+	ANDS	$15, R0, ZR
+	BNE	head_loop
+	// Adjust length for what remains
+	SUB	R2, R0, R3
+	SUB	R3, R1
+	// If size is less than 16 bytes, use tail_zero to zero what remains
+	CMP	$16, R1
+	BLT	tail_zero
+	B	aligned_to_16
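
The new runtime·indexbytebody<> is easier to follow with a scalar model of the syndrome
encoding it computes. The Go below is only an illustration, not the code the runtime
runs: the name indexByteSyndrome is invented here, the NEON compare and pairwise-add
sequence is replaced by a plain loop, and the masking of a partially valid 32-byte
block is approximated by leaving the unused lanes at zero. It shows why giving each
byte two bits and counting trailing zeros (done in the assembly with RBIT followed by
CLZ, since there is no direct count-trailing-zeros instruction) recovers the index of
the first match.

	package main

	import (
		"fmt"
		"math/bits"
	)

	// indexByteSyndrome scans b in 32-byte windows. For every window it builds
	// a 64-bit "syndrome" with two bits per input byte: bit 2*i is set when
	// byte i of the window equals c. Because bit order follows byte order,
	// counting trailing zeros and halving gives the first match position.
	func indexByteSyndrome(b []byte, c byte) int {
		for base := 0; base < len(b); base += 32 {
			end := base + 32
			if end > len(b) {
				end = len(b) // partial last window; unused lanes stay zero
			}
			var syndrome uint64
			for i := base; i < end; i++ {
				if b[i] == c {
					syndrome |= 1 << uint(2*(i-base))
				}
			}
			if syndrome != 0 {
				return base + bits.TrailingZeros64(syndrome)/2
			}
		}
		return -1
	}

	func main() {
		fmt.Println(indexByteSyndrome([]byte("hello, world"), 'w')) // 7
		fmt.Println(indexByteSyndrome([]byte("hello, world"), 'z')) // -1
	}

In the assembly, the AND with the repeating constant 0x40100401 plus the two VADDP
reductions (256->128, 128->64 bits) are what pack the 32 per-byte compare results into
that single 64-bit word while preserving their order.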
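The memclrNoHeapPointers rewrite follows a head/bulk/tail pattern: zero single bytes
until the pointer is 16-byte aligned, clear 16 bytes per iteration with a paired store
of ZR, then peel off the remaining 8/4/2/1 bytes by testing the low bits of the length
(the TBZ chain). The Go sketch below only models that control flow under the assumption
that alignment can be mimicked with a slice offset; memclrModel is a made-up name and
the real code works on raw addresses.

	package main

	import "fmt"

	func memclrModel(buf []byte) {
		i, n := 0, len(buf)
		// Head: zero byte by byte until the offset is a multiple of 16
		// (the assembly aligns the actual address in head_loop).
		for n >= 16 && i%16 != 0 {
			buf[i] = 0
			i++
			n--
		}
		// Bulk: 16 bytes per iteration, mirroring STP.P (ZR, ZR), 16(R0).
		for n >= 16 {
			for j := 0; j < 16; j++ {
				buf[i+j] = 0
			}
			i += 16
			n -= 16
		}
		// Tail: n is now < 16; test bits 3..0 of the remaining length,
		// mirroring the TBZ chain (tail_zero .. tail_zero_1).
		for _, w := range []int{8, 4, 2, 1} {
			if n&w != 0 {
				for j := 0; j < w; j++ {
					buf[i+j] = 0
				}
				i += w
			}
		}
	}

	func main() {
		b := []byte("0123456789abcdefghijklmnopqrstuvwxyz!")
		memclrModel(b)
		fmt.Println(b) // all bytes zeroed
	}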