diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s index dbb3b90fcf..edc6452bba 100644 --- a/src/runtime/memmove_ppc64x.s +++ b/src/runtime/memmove_ppc64x.s @@ -9,78 +9,112 @@ // See memmove Go doc for important implementation constraints. // func memmove(to, from unsafe.Pointer, n uintptr) + +// target address +#define TGT R3 +// source address +#define SRC R4 +// length to move +#define LEN R5 +// number of doublewords +#define DWORDS R6 +// number of bytes < 8 +#define BYTES R7 +// const 16 used as index +#define IDX16 R8 +// temp used for copies, etc. +#define TMP R9 +// number of 32 byte chunks +#define QWORDS R10 + TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24 - MOVD to+0(FP), R3 - MOVD from+8(FP), R4 - MOVD n+16(FP), R5 + MOVD to+0(FP), TGT + MOVD from+8(FP), SRC + MOVD n+16(FP), LEN // Determine if there are doublewords to // copy so a more efficient move can be done check: - ANDCC $7, R5, R7 // R7: bytes to copy - SRD $3, R5, R6 // R6: double words to copy - CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy + ANDCC $7, LEN, BYTES // R7: bytes to copy + SRD $3, LEN, DWORDS // R6: double words to copy + MOVFL CR0, CR3 // save CR from ANDCC + CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy // Determine overlap by subtracting dest - src and comparing against the - // length. The catches the cases where src and dest are in different types + // length. This catches the cases where src and dest are in different types // of storage such as stack and static to avoid doing backward move when not // necessary. - SUB R4, R3, R8 // dest - src - CMPU R8, R5, CR2 // < len? + SUB SRC, TGT, TMP // dest - src + CMPU TMP, LEN, CR2 // < len? BC 12, 8, backward // BLT CR2 backward // Copying forward if no overlap. - BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge" - SRDCC $2,R6,R8 // 32 byte chunks? - BNE forward32setup // - MOVD R6,CTR // R6 = number of double words - - // Move double words - -forward8: - MOVD 0(R4), R8 // double word - ADD $8,R4 - MOVD R8, 0(R3) // - ADD $8,R3 - BC 16, 0, forward8 - BR noforwardlarge // handle remainder + BC 12, 6, checkbytes // BEQ CR1, checkbytes + SRDCC $2, DWORDS, QWORDS // 32 byte chunks? + BEQ lt32gt8 // < 32 bytes // Prepare for moves of 32 bytes at a time. forward32setup: - DCBTST (R3) // prepare data cache - DCBT (R4) - MOVD R8, CTR // double work count - MOVD $16, R8 + DCBTST (TGT) // prepare data cache + DCBT (SRC) + MOVD QWORDS, CTR // Number of 32 byte chunks + MOVD $16, IDX16 // 16 for index forward32: - LXVD2X (R4+R0), VS32 // load 16 bytes - LXVD2X (R4+R8), VS33 - ADD $32, R4 - STXVD2X VS32, (R3+R0) // store 16 bytes - STXVD2X VS33, (R3+R8) - ADD $32,R3 // bump up for next set + LXVD2X (R0)(SRC), VS32 // load 16 bytes + LXVD2X (IDX16)(SRC), VS33 // load 16 bytes + ADD $32, SRC + STXVD2X VS32, (R0)(TGT) // store 16 bytes + STXVD2X VS33, (IDX16)(TGT) + ADD $32,TGT // bump up for next set BC 16, 0, forward32 // continue - RLDCLCC $61,R5,$3,R6 // remaining doublewords - BEQ noforwardlarge - MOVD R6,CTR // set up the CTR - BR forward8 + ANDCC $3, DWORDS // remaining doublewords + BEQ checkbytes // only bytes remain -noforwardlarge: - CMP R7,$0 // any remaining bytes - BC 4, 1, LR // ble lr +lt32gt8: + // At this point >= 8 and < 32 + // Move 16 bytes if possible + CMP DWORDS, $2 + BLT lt16 + LXVD2X (R0)(SRC), VS32 + ADD $-2, DWORDS + STXVD2X VS32, (R0)(TGT) + ADD $16, SRC + ADD $16, TGT -forwardtail: - MOVD R7, CTR // move tail bytes - -forwardtailloop: - MOVBZ 0(R4), R8 // move single bytes - ADD $1,R4 - MOVBZ R8, 0(R3) - ADD $1,R3 - BC 16, 0, forwardtailloop +lt16: // Move 8 bytes if possible + CMP DWORDS, $1 + BLT checkbytes + MOVD 0(SRC), TMP + ADD $8, SRC + MOVD TMP, 0(TGT) + ADD $8, TGT +checkbytes: + BC 12, 14, LR // BEQ lr +lt8: // Move word if possible + CMP BYTES, $4 + BLT lt4 + MOVWZ 0(SRC), TMP + ADD $-4, BYTES + MOVW TMP, 0(TGT) + ADD $4, SRC + ADD $4, TGT +lt4: // Move halfword if possible + CMP BYTES, $2 + BLT lt2 + MOVHZ 0(SRC), TMP + ADD $-2, BYTES + MOVH TMP, 0(TGT) + ADD $2, SRC + ADD $2, TGT +lt2: // Move last byte if 1 left + CMP BYTES, $1 + BC 12, 0, LR // ble lr + MOVBZ 0(SRC), TMP + MOVBZ TMP, 0(TGT) RET backward: @@ -88,51 +122,51 @@ backward: // R3 and R4 are advanced to the end of the destination/source buffers // respectively and moved back as we copy. - ADD R5, R4, R4 // end of source - ADD R3, R5, R3 // end of dest + ADD LEN, SRC, SRC // end of source + ADD TGT, LEN, TGT // end of dest BEQ nobackwardtail // earlier condition - MOVD R7, CTR // bytes to move + MOVD BYTES, CTR // bytes to move backwardtailloop: - MOVBZ -1(R4), R8 // point to last byte - SUB $1,R4 - MOVBZ R8, -1(R3) - SUB $1,R3 + MOVBZ -1(SRC), TMP // point to last byte + SUB $1,SRC + MOVBZ TMP, -1(TGT) + SUB $1,TGT BC 16, 0, backwardtailloop // bndz nobackwardtail: BC 4, 5, LR // ble CR1 lr backwardlarge: - MOVD R6, CTR - SUB R3, R4, R9 // Use vsx if moving - CMP R9, $32 // at least 32 byte chunks + MOVD DWORDS, CTR + SUB TGT, SRC, TMP // Use vsx if moving + CMP TMP, $32 // at least 32 byte chunks BLT backwardlargeloop // and distance >= 32 - SRDCC $2,R6,R8 // 32 byte chunks + SRDCC $2,DWORDS,QWORDS // 32 byte chunks BNE backward32setup backwardlargeloop: - MOVD -8(R4), R8 - SUB $8,R4 - MOVD R8, -8(R3) - SUB $8,R3 + MOVD -8(SRC), TMP + SUB $8,SRC + MOVD TMP, -8(TGT) + SUB $8,TGT BC 16, 0, backwardlargeloop // bndz RET backward32setup: - MOVD R8, CTR // set up loop ctr - MOVD $16, R8 // 32 bytes at at time + MOVD QWORDS, CTR // set up loop ctr + MOVD $16, IDX16 // 32 bytes at at time backward32loop: - SUB $32, R4 - SUB $32, R3 - LXVD2X (R4+R0), VS32 // load 16 bytes - LXVD2X (R4+R8), VS33 - STXVD2X VS32, (R3+R0) // store 16 bytes - STXVD2X VS33, (R3+R8) + SUB $32, TGT + SUB $32, SRC + LXVD2X (R0)(TGT), VS32 // load 16 bytes + LXVD2X (IDX16)(TGT), VS33 + STXVD2X VS32, (R0)(SRC) // store 16 bytes + STXVD2X VS33, (IDX16)(SRC) BC 16, 0, backward32loop // bndz BC 4, 5, LR // ble CR1 lr - MOVD R6, CTR + MOVD DWORDS, CTR BR backwardlargeloop