diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 5ed3de68e2..74db48933f 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -98,103 +98,59 @@ TEXT ·shrVU(SB), NOSPLIT, $0
 
 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
 TEXT ·mulAddVWW(SB), NOSPLIT, $0
-	MOVD z+0(FP), R10
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R9
-	MOVD r+56(FP), R4 // c = r
-	MOVD z_len+8(FP), R11
-	MOVD $0, R3 // i = 0
-	MOVD $8, R18
-	MOVD $1, R19
+	MOVD z+0(FP), R10	// R10 = z[] base pointer
+	MOVD x+24(FP), R8	// R8 = x[] base pointer
+	MOVD y+48(FP), R9	// R9 = y (multiplier)
+	MOVD r+56(FP), R4	// R4 = c, the running carry, seeded with r
+	MOVD z_len+8(FP), R11	// R11 = z_len = number of words to process
 
-	JMP e5
+	MOVD R0, R3	// R3 = byte offset into x[] and z[], starts at 0
+	CMP R0, R11	// Test z_len against zero (R0 is used as the zero source)
+	MOVD R11, CTR	// CTR = z_len, the loop trip count
+	BEQ done	// z_len == 0: skip the loop, c stays equal to r
 
-l5:
-	MULLD R18, R3, R5
-	MOVD (R8)(R5), R20
-	MULLD R9, R20, R6
-	MULHDU R9, R20, R7
-	ADDC R4, R6
+loop:
+	MOVD (R8)(R3), R20	// R20 = x[i]
+	MULLD R9, R20, R6	// R6 = low 64 bits of x[i]*y
+	MULHDU R9, R20, R7	// R7 = high 64 bits of x[i]*y (unsigned)
+	ADDC R4, R6	// R6 += c; the carry-out is folded into R7 by ADDZE below
 	ADDZE R7
-	MOVD R6, (R10)(R5)
-	MOVD R7, R4
-	ADD R19, R3
-
-e5:
-	CMP R3, R11
-	BLT l5
+	MOVD R6, (R10)(R3)	// z[i] = low word of x[i]*y + c
+	MOVD R7, R4	// c = high word plus carry, used by the next iteration
+	ADD $8, R3	// Advance the offset by one 8-byte word
+	BC 16, 0, loop	// bdnz: decrement CTR, branch while CTR != 0
 
+done:
 	MOVD R4, c+64(FP)
 	RET
 
 // func addMulVVW(z, x []Word, y Word) (c Word)
 TEXT ·addMulVVW(SB), NOSPLIT, $0
-	MOVD z+0(FP), R10
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R9
-	MOVD z_len+8(FP), R22
+	MOVD z+0(FP), R10	// R10 = z[] base pointer
+	MOVD x+24(FP), R8	// R8 = x[] base pointer
+	MOVD y+48(FP), R9	// R9 = y (multiplier)
+	MOVD z_len+8(FP), R22	// R22 = z_len = number of words to process
 
-	MOVD $0, R5 // i = 0
-	MOVD $0, R4 // c = 0
-	MOVD $8, R28
-	MOVD $-2, R23
-	AND R22, R23 // mask the last bit of z.len
-	MOVD $2, R24
-	CMP R23, R24
-	BGE unrolled
-	JMP end
-
-unrolled:
-	MOVD $8, R19 // no (RA)(RB*8) on power
-	MULLD R5, R19
-	MOVD (R10)(R19), R11 // R11 = z[i]
-	MOVD (R8)(R19), R16 // R16 = x[i]
-	ADD R28, R19, R25
-	MOVD (R10)(R25), R17
-	MOVD (R8)(R25), R18
-
-	MULLD R9, R16, R12
-	MULHDU R9, R16, R14
-	MULLD R9, R18, R6
-	MULHDU R9, R18, R7
-	ADDC R4, R12
-	ADDZE R14
-	ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
-	ADDZE R14 // carry = high order bits + add carry
-	MOVD R12, (R10)(R19)
-	ADDC R14, R6
-	ADDZE R7
-	ADDC R17, R6
-	ADDZE R7
-	MOVD R6, (R10)(R25)
-	MOVD R7, R4
-
-	ADD R24, R5
-	CMP R5, R23
-	BLT unrolled
-	JMP end
+	MOVD R0, R3	// R3 = byte offset into x[] and z[], starts at 0
+	CMP R0, R22	// Test z_len against zero (R0 is used as the zero source)
+	MOVD R0, R4	// R4 = c = 0
+	MOVD R22, CTR	// CTR = z_len, the loop trip count
+	BEQ done	// z_len == 0: skip the loop and return c = 0
 
 loop:
-	MOVD $8, R19
-	MULLD R5, R19
-	MOVD (R10)(R19), R11
-	MOVD (R8)(R19), R16
-	MULLD R9, R16, R12
-	MULHDU R9, R16, R14
-	ADDC R4, R12
-	ADDZE R14
-	ADDC R11, R12
-	ADDZE R14
-	MOVD R12, (R10)(R19)
-	MOVD R14, R4
-
-	MOVD $1, R15
-	ADD R15, R5
-
-end:
-	CMP R5, R22
-	BLT loop
+	MOVD (R8)(R3), R20	// R20 = x[i]
+	MOVD (R10)(R3), R21	// R21 = z[i]
+	MULLD R9, R20, R6	// R6 = low 64 bits of x[i]*y
+	MULHDU R9, R20, R7	// R7 = high 64 bits of x[i]*y (unsigned)
+	ADDC R21, R6	// R6 += z[i], recording the carry-out
+	ADDZE R7	// R7 += carry from the z[i] addition
+	ADDC R4, R6	// R6 += c, recording the carry-out
+	ADDZE R7, R4	// c = R7 + carry (R4 is overwritten, not accumulated)
+	MOVD R6, (R10)(R3)	// z[i] = low word of x[i]*y + z[i] + c
+	ADD $8, R3	// Advance the offset by one 8-byte word
+	BC 16, 0, loop	// bdnz: decrement CTR, branch while CTR != 0
 
+done:
 	MOVD R4, c+56(FP)
 	RET
 