diff --git a/src/cmd/asm/internal/asm/testdata/ppc64.s b/src/cmd/asm/internal/asm/testdata/ppc64.s
index a12a4b55d5..2909c39094 100644
--- a/src/cmd/asm/internal/asm/testdata/ppc64.s
+++ b/src/cmd/asm/internal/asm/testdata/ppc64.s
@@ -932,6 +932,12 @@ label1:
 //	VRT,VRA,VRB,VRC
 	VPERM	V3, V2, V1, V0
 
+//	Vector bit permute, VX-form
+//	VRA,VRB,VRT produces
+//	VRT,VRA,VRB
+	VBPERMQ	V3,V1,V2
+	VBPERMD	V3,V1,V2
+
 //	Vector select, VA-form
 //	VRA,VRB,VRC,VRT produces
 //	VRT,VRA,VRB,VRC
diff --git a/src/cmd/internal/obj/ppc64/a.out.go b/src/cmd/internal/obj/ppc64/a.out.go
index 6b5a1b4351..e684281774 100644
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -859,6 +859,8 @@ const (
 	AVCMPNEZB
 	AVCMPNEZBCC
 	AVPERM
+	AVBPERMQ
+	AVBPERMD
 	AVSEL
 	AVSPLT
 	AVSPLTB
diff --git a/src/cmd/internal/obj/ppc64/anames.go b/src/cmd/internal/obj/ppc64/anames.go
index 142b53eadd..b7ca133057 100644
--- a/src/cmd/internal/obj/ppc64/anames.go
+++ b/src/cmd/internal/obj/ppc64/anames.go
@@ -474,6 +474,8 @@ var Anames = []string{
 	"VCMPNEZB",
 	"VCMPNEZBCC",
 	"VPERM",
+	"VBPERMQ",
+	"VBPERMD",
 	"VSEL",
 	"VSPLT",
 	"VSPLTB",
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index 2b8efe846d..1f488d5e4d 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -421,6 +421,9 @@ var optab = []Optab{
 	/* Vector permute */
 	{AVPERM, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector permute, va-form */
 
+	/* Vector bit permute */
+	{AVBPERMQ, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector bit permute, vx-form */
+
 	/* Vector select */
 	{AVSEL, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector select, va-form */
 
@@ -1378,6 +1381,9 @@ func buildop(ctxt *obj.Link) {
 		case AVPERM: /* vperm */
 			opset(AVPERM, r0)
 
+		case AVBPERMQ: /* vbpermq, vbpermd */
+			opset(AVBPERMD, r0)
+
 		case AVSEL: /* vsel */
 			opset(AVSEL, r0)
 
@@ -4165,6 +4171,11 @@ func (c *ctxt9) oprrr(a obj.As) uint32 {
 	case AVSRAD:
 		return OPVX(4, 964, 0, 0) /* vsrad - v2.07 */
 
+	case AVBPERMQ:
+		return OPVC(4, 1356, 0, 0) /* vbpermq - v2.07 */
+	case AVBPERMD:
+		return OPVC(4, 1484, 0, 0) /* vbpermd - v3.00 */
+
 	case AVCLZB:
 		return OPVX(4, 1794, 0, 0) /* vclzb - v2.07 */
 	case AVCLZH:
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 487187f4d8..e02ca16907 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1084,24 +1084,17 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
 
 TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 	DCBT	(R3)		// Prepare cache line.
-	MOVD	R3,R10		// Save base address for calculating the index later.
+	MOVD	R3,R17		// Save base address for calculating the index later.
 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
-
-	// Calculate last acceptable address and check for possible overflow
-	// using a saturated add.
-	// Overflows set last acceptable address to 0xffffffffffffffff.
-	ADD	R4,R3,R7
-	SUBC	R3,R7,R6
-	SUBE	R0,R0,R9
-	MOVW	R9,R6
-	OR	R6,R7,R7
+	ADD	R4,R3,R7	// Last acceptable address in R7.
 	RLDIMI	$16,R5,$32,R5
 	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
 	MOVD	$-1,R9
-	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
 	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
 	ADD	$-1,R7,R7
 #ifdef GOARCH_ppc64le
 	SLD	R6,R9,R9	// Prepare mask for Little Endian
@@ -1110,56 +1103,142 @@ TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #endif
 	BLE	small_string	// Jump to the small string case if it's <32 bytes.
 
-	// Case for length >32 bytes
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
 	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
 
-	// Check for doubleword alignment and jump to the loop setup if aligned.
-	MOVFL	R8,CR7
-	BC	12,28,loop_setup
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
 
-	// Not aligned, so handle the second doubleword
-	MOVDU	8(R8),R12
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
 	CMPB	R12,R5,R3
 	CMPU	R3,$0,CR7
 	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
 
-loop_setup:
-	// We are now aligned to a 16-byte boundary. We will load two doublewords
-	// per loop iteration. The last doubleword is in R7, so our loop counter
-	// starts at (R7-R8)/16.
-	SUB	R8,R7,R6
-	SRD	$4,R6,R6
-	MOVD	R6,CTR
+	// Either quadword or 64-byte aligned at this point. We can use LVX.
+qw_align:
 
-	// Note: when we have an align directive, align this loop to 32 bytes so
-	// it fits in a single icache sector.
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB	$0,V0		// Replicate 0 across V0
+	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11
+	VSLB	V11,V10,V10
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMPU	R4,$64		// If len <= 64, don't use the vectorized loop
+	BLE	tail
+
+	// We will load 4 quadwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$-16,R4,R4
+	ADD	$16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail		// If len <= 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9	// Loop counter in R9
+	MOVD	R9,CTR
+
+	MOVD	$16,R11		// Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
+
+	// Main loop: we will load 64 bytes per iteration
 loop:
-	// Load two doublewords, then compare and merge in a single register. We
-	// will check two doublewords per iteration, then find out which of them
-	// contains the byte later. This speeds up the search.
-	MOVD	8(R8),R12
-	MOVDU	16(R8),R11
-	CMPB	R12,R5,R3
-	CMPB	R11,R5,R9
-	OR	R3,R9,R6
-	CMPU	R6,$0,CR7
-	BNE	CR7,found
-	BC	16,0,loop
+	LVX	(R8+R0),V2	// Load 4 16-byte vectors
+	LVX	(R11+R8),V3
+	LVX	(R9+R8),V4
+	LVX	(R7+R8),V5
+	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
+	VCMPEQUB	V1,V3,V7
+	VCMPEQUB	V1,V4,V8
+	VCMPEQUB	V1,V5,V9
+	VOR	V6,V7,V11	// Compress the result in a single vector
+	VOR	V8,V9,V12
+	VOR	V11,V12,V11
+	VCMPEQUBCC	V0,V11,V11	// Check for byte
+	BGE	CR6,found
+	ADD	$64,R8,R8
+	BC	16,0,loop	// bdnz loop
 
-	// Counter zeroed, but we may have another doubleword to read
-	CMPU	R8,R7
-	BEQ	notfound
+	// Handle the trailing bytes, or the case where R4 <= 64
+	RLDICL	$0,R6,$58,R4
+tail:
+	CMPU	R4,$0
+	BEQ	notfound
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
 
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	BNE	CR6,done
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
 
 notfound:
 	MOVD	$-1,R3
@@ -1167,15 +1246,68 @@ notfound:
 	RET
 
 found:
-	// One of the doublewords from the loop contains the byte we are looking
-	// for. Check the first doubleword and adjust the address if found.
-	CMPU	R3,$0,CR6
-	ADD	$-8,R8,R8
-	BNE	CR6,done
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
 
-	// Not found, so it must be in the second doubleword of the merged pair.
-	MOVD	R9,R3
-	ADD	$8,R8,R8
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	V6,V10,V6
+	VBPERMQ	V7,V10,V7
+	VBPERMQ	V8,V10,V8
+	VBPERMQ	V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	$2,V7,V7,V7
+	VSLDOI	$4,V8,V8,V8
+	VSLDOI	$6,V9,V9,V9
+#else
+	VSLDOI	$6,V6,V6,V6
+	VSLDOI	$4,V7,V7,V7
+	VSLDOI	$2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17,R3
+	MOVD	R3,(R14)
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	V6,R3
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11
+#else
+	VSLDOI	$6,V6,V6,V6
+	MFVRD	V6,R3
+	CNTLZD	R3,R11
+#endif
+	ADD	R8,R11,R3
+	CMPU	R11,R4
+	BLT	return
+	BR	notfound
 
 done:
 	// At this point, R3 has 0xFF in the same position as the byte we are
@@ -1191,17 +1323,10 @@ done:
 	CMPU	R8,R7	// Check if we are at the last doubleword.
 	SRD	$3,R11	// Convert trailing zeros to bytes.
 	ADD	R11,R8,R3
-	CMPU	R11,R4,CR7	// If at the last doubleword, check the byte offset.
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
 	BNE	return
 	BLE	CR7,return
-	MOVD	$-1,R3
-	MOVD	R3,(R14)
-	RET
-
-return:
-	SUB	R10,R3	// Calculate index.
-	MOVD	R3,(R14)
-	RET
+	BR	notfound
 
 small_string:
 	// We unroll this loop for better performance.
 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base.
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
-	CMPU	R8,R7
+	CMPU	R8,R7
 	BNE	CR7,done
 	BEQ	notfound	// Hit length.
@@ -1242,7 +1367,6 @@ small_string:
 	MOVDU	8(R8),R12
 	CMPB	R12,R5,R3
 	CMPU	R3,$0,CR6
-	CMPU	R8,R7
 	BNE	CR6,done
 	BR	notfound
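For readers following the vectorized path in the runtime/asm_ppc64x.s hunks above, here is a rough scalar model of the index-recovery step, written as plain Go rather than assembly. It is not part of the patch: after VCMPEQUB each byte of a comparison result is 0x00 or 0xFF, VBPERMQ (driven by the control vector built with LVSL/VSLB) packs one bit per byte into a 16-bit mask, the four masks for a 64-byte block are merged into one doubleword, and the first match falls out of a trailing-zero count (the little-endian flavor; big-endian uses a leading-zero count instead). The helper names `byteMask16` and `indexFromMasks` are made up for illustration only.

```go
package main

import (
	"fmt"
	"math/bits"
)

// byteMask16 models VCMPEQUB followed by VBPERMQ for one 16-byte chunk:
// it returns a 16-bit mask with bit i set when chunk[i] == c. In the
// assembly, the compare produces 0x00/0xFF bytes and VBPERMQ gathers one
// bit from each byte using the 0,8,16,...,120 control vector.
func byteMask16(chunk []byte, c byte) uint64 {
	var m uint64
	for i, b := range chunk[:16] {
		if b == c {
			m |= 1 << uint(i)
		}
	}
	return m
}

// indexFromMasks models the VSLDOI/VOR/MFVRD merge plus the little-endian
// index recovery: the four 16-bit masks of a 64-byte block are packed into
// a single doubleword, and the position of the lowest set bit is the byte
// offset of the first match within the block.
func indexFromMasks(m0, m1, m2, m3 uint64) int {
	merged := m0 | m1<<16 | m2<<32 | m3<<48
	if merged == 0 {
		return -1 // no match in this 64-byte block
	}
	return bits.TrailingZeros64(merged)
}

func main() {
	block := make([]byte, 64)
	block[37] = 'x' // plant a match in the third 16-byte chunk

	m0 := byteMask16(block[0:16], 'x')
	m1 := byteMask16(block[16:32], 'x')
	m2 := byteMask16(block[32:48], 'x')
	m3 := byteMask16(block[48:64], 'x')

	fmt.Println(indexFromMasks(m0, m1, m2, m3)) // prints 37
}
```

The `bits.TrailingZeros64` call stands in for the `ADD $-1`/`ANDN`/`POPCNTD` sequence in the patch, which emulates a trailing-zero count because POWER8 (ISA 2.07) has no dedicated count-trailing-zeros instruction; the big-endian branch counts leading zeros with `CNTLZD` instead.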