diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s
index 3b23ce89d8..072963f756 100644
--- a/src/runtime/memclr_ppc64x.s
+++ b/src/runtime/memclr_ppc64x.s
@@ -14,34 +14,68 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT|NOFRAME, $0-16
 	// Determine if there are doublewords to clear
 check:
 	ANDCC	$7, R4, R5	// R5: leftover bytes to clear
-	SRAD	$3, R4, R6	// R6: double words to clear
+	SRD	$3, R4, R6	// R6: double words to clear
 	CMP	R6, $0, CR1	// CR1[EQ] set if no double words
-	BC	12, 6, nozerolarge	// only single bytes
-	MOVD	R6, CTR		// R6 = number of double words
-	SRADCC	$2, R6, R7	// 32 byte chunks?
-	BNE	zero32setup
+	BC	12, 6, nozerolarge	// only single bytes
+	CMP	R4, $512
+	BLT	under512	// special case for < 512 bytes
+	ANDCC	$127, R3, R8	// check for 128 byte alignment of address
+	BEQ	zero512setup
+
+	ANDCC	$7, R3, R15
+	BEQ	zero512xsetup	// at least 8 byte aligned
+
+	// zero bytes up to 8 byte alignment
+
+	ANDCC	$1, R3, R15	// check for byte alignment
+	BEQ	byte2
+	MOVB	R0, 0(R3)	// zero 1 byte
+	ADD	$1, R3		// bump ptr by 1
+	ADD	$-1, R4
+
+byte2:
+	ANDCC	$2, R3, R15	// check for 2 byte alignment
+	BEQ	byte4
+	MOVH	R0, 0(R3)	// zero 2 bytes
+	ADD	$2, R3		// bump ptr by 2
+	ADD	$-2, R4
+
+byte4:
+	ANDCC	$4, R3, R15	// check for 4 byte alignment
+	BEQ	zero512xsetup
+	MOVW	R0, 0(R3)	// zero 4 bytes
+	ADD	$4, R3		// bump ptr by 4
+	ADD	$-4, R4
+	BR	zero512xsetup	// ptr should now be 8 byte aligned
+
+under512:
+	MOVD	R6, CTR		// R6 = number of double words
+	SRDCC	$2, R6, R7	// 32 byte chunks?
+	BNE	zero32setup
 
 	// Clear double words
 
 zero8:
 	MOVD	R0, 0(R3)	// double word
 	ADD	$8, R3
+	ADD	$-8, R4
 	BC	16, 0, zero8	// dec ctr, br zero8 if ctr not 0
-	BR	nozerolarge	// handle remainder
+	BR	nozerolarge	// handle leftovers
 
 	// Prepare to clear 32 bytes at a time.
 
 zero32setup:
-	DCBTST	(R3)		// prepare data cache
-	MOVD	R7, CTR		// number of 32 byte chunks
+	DCBTST	(R3)		// prepare data cache
+	XXLXOR	VS32, VS32, VS32	// clear VS32 (V0)
+	MOVD	R7, CTR		// number of 32 byte chunks
+	MOVD	$16, R8
 
 zero32:
-	MOVD	R0, 0(R3)	// clear 4 double words
-	MOVD	R0, 8(R3)
-	MOVD	R0, 16(R3)
-	MOVD	R0, 24(R3)
+	STXVD2X	VS32, (R3+R0)	// store 16 bytes
+	STXVD2X	VS32, (R3+R8)
 	ADD	$32, R3
+	ADD	$-32, R4
 	BC	16, 0, zero32	// dec ctr, br zero32 if ctr not 0
 	RLDCLCC	$61, R4, $3, R6	// remaining doublewords
 	BEQ	nozerolarge
@@ -49,8 +83,8 @@ zero32:
 	BR	zero8
 
 nozerolarge:
-	CMP	R5, $0		// any remaining bytes
-	BC	4, 1, LR	// ble lr
+	ANDCC	$7, R4, R5	// any remaining bytes
+	BC	4, 1, LR	// ble lr
 
 zerotail:
 	MOVD	R5, CTR		// set up to clear tail bytes
@@ -60,3 +94,70 @@ zerotailloop:
 	ADD	$1, R3
 	BC	16, 0, zerotailloop	// dec ctr, br zerotailloop if ctr not 0
 	RET
+
+zero512xsetup:	// 512 byte chunks, with extra alignment work needed
+	ANDCC	$8, R3, R11	// 8 byte alignment?
+	BEQ	zero512setup16
+	MOVD	R0, 0(R3)	// clear 8 bytes
+	ADD	$8, R3		// update ptr to next 8
+	ADD	$-8, R4		// dec count by 8
+
+zero512setup16:
+	ANDCC	$127, R3, R14	// < 128 byte alignment
+	BEQ	zero512setup	// handle 128 byte alignment
+	MOVD	$128, R15
+	SUB	R14, R15, R14	// find increment to 128 alignment
+	SRD	$4, R14, R15	// number of 16 byte chunks
+
+zero512presetup:
+	MOVD	R15, CTR	// loop counter of 16 bytes
+	XXLXOR	VS32, VS32, VS32	// clear VS32 (V0)
+
+zero512preloop:	// clear up to 128 alignment
+	STXVD2X	VS32, (R3+R0)	// clear 16 bytes
+	ADD	$16, R3		// update ptr
+	ADD	$-16, R4	// dec count
+	BC	16, 0, zero512preloop
+
+zero512setup:	// setup for dcbz loop
+	CMP	R4, $512	// check if at least 512
+	BLT	remain
+	SRD	$9, R4, R8	// loop count for 512 chunks
+	MOVD	R8, CTR		// set up counter
+	MOVD	$128, R9	// index regs for 128 bytes
+	MOVD	$256, R10
+	MOVD	$384, R11
+
+zero512:
+	DCBZ	(R3+R0)		// clear first chunk
+	DCBZ	(R3+R9)		// clear second chunk
+	DCBZ	(R3+R10)	// clear third chunk
+	DCBZ	(R3+R11)	// clear fourth chunk
+	ADD	$512, R3
+	ADD	$-512, R4
+	BC	16, 0, zero512
+
+remain:
+	CMP	R4, $128	// check if 128 byte chunks left
+	BLT	smaller
+	DCBZ	(R3+R0)		// clear 128
+	ADD	$128, R3
+	ADD	$-128, R4
+	BR	remain
+
+smaller:
+	ANDCC	$127, R4, R7	// find leftovers
+	BEQ	done
+	CMP	R7, $64		// more than 64, do 32 at a time
+	BLT	zero8setup	// less than 64, do 8 at a time
+	SRD	$5, R7, R7	// set up counter for 32
+	BR	zero32setup
+
+zero8setup:
+	SRDCC	$3, R7, R7	// less than 8 bytes
+	BEQ	nozerolarge
+	MOVD	R7, CTR
+	BR	zero8
+
+done:
+	RET
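The memclr change above can only use DCBZ once the pointer is 128 byte aligned, since DCBZ zeroes an entire cache line. The code therefore first walks the pointer to 8 and then 16 byte alignment with MOVB/MOVH/MOVW/MOVD stores, then issues 16 byte STXVD2X stores until it reaches a 128 byte boundary. A minimal Go sketch of the zero512setup16 arithmetic, assuming a pointer the earlier blocks have already brought to 16 byte alignment (the function name is mine, not the runtime's):

```go
package main

import "fmt"

// storesTo128 mirrors the zero512setup16 block: given a 16 byte aligned
// address, return how many 16 byte STXVD2X stores are needed before
// DCBZ's 128 byte alignment requirement is met.
func storesTo128(addr uintptr) uintptr {
	rem := addr & 127 // ANDCC $127, R3, R14
	if rem == 0 {
		return 0 // BEQ zero512setup: already cache line aligned
	}
	pad := 128 - rem // MOVD $128, R15; SUB R14, R15, R14
	return pad >> 4  // SRD $4, R14, R15: number of 16 byte chunks
}

func main() {
	for _, addr := range []uintptr{0x1000, 0x1010, 0x1070} {
		fmt.Printf("%#x: %d sixteen-byte stores\n", addr, storesTo128(addr))
	}
}
```

Once aligned, each DCBZ in the zero512 loop zeroes one 128 byte cache line, so a single loop iteration clears 512 bytes without issuing ordinary stores for the data.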
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index b79f76d388..60cbcc41ec 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -16,7 +16,7 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	// copy so a more efficient move can be done
 check:
 	ANDCC	$7, R5, R7	// R7: bytes to copy
-	SRAD	$3, R5, R6	// R6: double words to copy
+	SRD	$3, R5, R6	// R6: double words to copy
 	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
 
 	// Determine overlap by subtracting dest - src and comparing against the
@@ -31,9 +31,9 @@ check:
 	// Copying forward if no overlap.
 
 	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
-	MOVD	R6,CTR			// R6 = number of double words
-	SRADCC	$2,R6,R8		// 32 byte chunks?
+	SRDCC	$2,R6,R8		// 32 byte chunks?
 	BNE	forward32setup		//
+	MOVD	R6,CTR			// R6 = number of double words
 
 	// Move double words
 
@@ -51,17 +51,14 @@ forward32setup:
 	DCBTST	(R3)		// prepare data cache
 	DCBT	(R4)
 	MOVD	R8, CTR		// double work count
+	MOVD	$16, R8
 
 forward32:
-	MOVD	0(R4), R8	// load 4 double words
-	MOVD	8(R4), R9
-	MOVD	16(R4), R14
-	MOVD	24(R4), R15
-	ADD	$32,R4
-	MOVD	R8, 0(R3)	// store those 4
-	MOVD	R9, 8(R3)
-	MOVD	R14,16(R3)
-	MOVD	R15,24(R3)
+	LXVD2X	(R4+R0), VS32	// load 16 bytes
+	LXVD2X	(R4+R8), VS33
+	ADD	$32, R4
+	STXVD2X	VS32, (R3+R0)	// store 16 bytes
+	STXVD2X	VS33, (R3+R8)
 	ADD	$32,R3		// bump up for next set
 	BC	16, 0, forward32	// continue
 	RLDCLCC	$61,R5,$3,R6	// remaining doublewords
@@ -71,7 +68,7 @@ forward32:
 
 noforwardlarge:
 	CMP	R7,$0		// any remaining bytes
-	BC	4, 1, LR
+	BC	4, 1, LR	// ble lr
 
 forwardtail:
 	MOVD	R7, CTR		// move tail bytes
@@ -101,19 +98,39 @@ backwardtailloop:
 	SUB	$1,R4
 	MOVBZ	R8, -1(R3)
 	SUB	$1,R3
-	BC	16, 0, backwardtailloop
+	BC	16, 0, backwardtailloop	// bdnz
 
 nobackwardtail:
-	CMP	R6,$0
-	BC	4, 5, LR
+	BC	4, 5, LR		// ble CR1 lr
 
 backwardlarge:
 	MOVD	R6, CTR
+	SUB	R3, R4, R9		// Use vsx if moving
+	CMP	R9, $32			// at least 32 byte chunks
+	BLT	backwardlargeloop	// and distance >= 32
+	SRDCC	$2,R6,R8		// 32 byte chunks
+	BNE	backward32setup
 
 backwardlargeloop:
 	MOVD	-8(R4), R8
 	SUB	$8,R4
 	MOVD	R8, -8(R3)
 	SUB	$8,R3
-	BC	16, 0, backwardlargeloop //
+	BC	16, 0, backwardlargeloop	// bdnz
 	RET
+
+backward32setup:
+	MOVD	R8, CTR			// set up loop ctr
+	MOVD	$16, R8			// 32 bytes at a time
+
+backward32loop:
+	SUB	$32, R4
+	SUB	$32, R3
+	LXVD2X	(R4+R0), VS32		// load 16 bytes
+	LXVD2X	(R4+R8), VS33
+	STXVD2X	VS32, (R3+R0)		// store 16 bytes
+	STXVD2X	VS33, (R3+R8)
+	BC	16, 0, backward32loop	// bdnz
+	BC	4, 5, LR		// ble CR1 lr
+	MOVD	R6, CTR
+	BR	backwardlargeloop
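On the memmove side, the forward path's setup is plain shifts and masks on the length in R5: SRD $3 for doublewords, SRDCC $2 of that for 32 byte chunks, ANDCC $7 for the byte tail. A hedged Go model of that decomposition (the helper name and sample lengths are mine):

```go
package main

import "fmt"

// decompose models the counts derived in check/forward32setup: how many
// 32 byte VSX iterations, leftover doublewords, and tail bytes a copy of
// length n breaks into.
func decompose(n uint64) (chunks32, dwords, tail uint64) {
	dw := n >> 3       // SRD $3, R5, R6: doubleword count
	chunks32 = dw >> 2 // SRDCC $2, R6, R8: 32 byte chunks
	dwords = dw & 3    // doublewords left after the 32 byte loop
	tail = n & 7       // ANDCC $7, R5, R7: tail bytes
	return
}

func main() {
	for _, n := range []uint64{9, 100, 1000} {
		c, d, t := decompose(n)
		fmt.Printf("n=%4d: %2d x 32B + %d x 8B + %d B\n", n, c, d, t)
	}
}
```

The switch from SRAD/SRADCC to SRD/SRDCC fits the same picture: the length is never negative, so the cheaper logical shift suffices, presumably avoiding the carry side effects the arithmetic shift drags along.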
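The backward path only runs when the regions overlap in the one direction a forward copy would corrupt, per the "Determine overlap by subtracting dest - src" comment near the top of the file. A sketch of that direction test (the function name is mine); note that the new backward32setup loop additionally requires, per its comments, whole 32 byte chunks and a source/destination distance of at least 32 before using the 16 byte vector loads:

```go
package main

import "fmt"

// mustCopyBackward models the overlap test described in memmove_ppc64x.s:
// copy backward only when dst falls inside [src, src+n), because a forward
// pass would overwrite source bytes before reading them. For dst < src the
// unsigned subtraction wraps to a huge value, so the forward path is taken.
func mustCopyBackward(dst, src, n uintptr) bool {
	return dst-src < n // dest - src compared unsigned against the length
}

func main() {
	fmt.Println(mustCopyBackward(110, 100, 50)) // true: dst inside the source range
	fmt.Println(mustCopyBackward(90, 100, 50))  // false: forward copy is safe
}
```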