diff --git a/src/crypto/elliptic/fuzz_test.go b/src/crypto/elliptic/fuzz_test.go index eaeed0dacc..b9209a789b 100644 --- a/src/crypto/elliptic/fuzz_test.go +++ b/src/crypto/elliptic/fuzz_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 arm64 +// +build amd64 arm64 ppc64le package elliptic diff --git a/src/crypto/elliptic/p256_asm_ppc64le.s b/src/crypto/elliptic/p256_asm_ppc64le.s new file mode 100644 index 0000000000..4428a18260 --- /dev/null +++ b/src/crypto/elliptic/p256_asm_ppc64le.s @@ -0,0 +1,2517 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// This is a port of the s390x asm implementation. +// to ppc64le. + +// Some changes were needed due to differences in +// the Go opcodes and/or available instructions +// between s390x and ppc64le. + +// 1. There were operand order differences in the +// VSUBUQM, VSUBCUQ, and VSEL instructions. + +// 2. ppc64 does not have a multiply high and low +// like s390x, so those were implemented using +// macros to compute the equivalent values. + +// 3. The LVX, STVX instructions on ppc64 require +// 16 byte alignment of the data. To avoid that +// requirement, data is loaded using LXVD2X and +// STXVD2X with VPERM to reorder bytes correctly. + +// I have identified some areas where I believe +// changes would be needed to make this work for big +// endian; however additional changes beyond what I +// have noted are most likely needed to make it work. +// - The string used with VPERM to swap the byte order +// for loads and stores. +// - The EXTRACT_HI and EXTRACT_LO strings. +// - The constants that are loaded from CPOOL. +// + +// Permute string used by VPERM to reorder bytes +// loaded or stored using LXVD2X or STXVD2X +// on little endian. +DATA byteswap<>+0(SB)/8, $0x08090a0b0c0d0e0f +DATA byteswap<>+8(SB)/8, $0x0001020304050607 + +// The following constants are defined in an order +// that is correct for use with LXVD2X/STXVD2X +// on little endian. 
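+// p256<>+0x00 through +0x18 hold the P-256 prime
+// p = 2^256 - 2^224 + 2^192 + 2^96 - 1; p256mul<>+0x00 through +0x18
+// hold the same prime with its two 128-bit halves swapped.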
+DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 +DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256 +DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 +DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 +DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0 +DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0 +DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original +DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256 +DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original +DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256 +DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0 +DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0 +DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0 +DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0 +DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1 +DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1 +DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0 +DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0 +DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0 +DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0 +DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256 +DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256 +DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256 +DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256 + +// The following are used with VPERM to extract the high and low +// values from the intermediate results of a vector multiply. +// They are used in the VMULTxxx macros. These have been tested +// only on little endian, I think they would have to be different +// for big endian. +DATA p256permhilo<>+0x00(SB)/8, $0x0405060714151617 // least significant +DATA p256permhilo<>+0x08(SB)/8, $0x0c0d0e0f1c1d1e1f +DATA p256permhilo<>+0x10(SB)/8, $0x0001020310111213 // most significant +DATA p256permhilo<>+0x18(SB)/8, $0x08090a0b18191A1B + +// External declarations for constants +GLOBL p256ord<>(SB), 8, $32 +GLOBL p256<>(SB), 8, $80 +GLOBL p256mul<>(SB), 8, $160 +GLOBL p256permhilo<>(SB), 8, $32 +GLOBL byteswap<>+0(SB), RODATA, $16 + +// The following macros are used to implement the ppc64le +// equivalent function from the corresponding s390x +// instruction for vector multiply high, low, and add, +// since there aren't exact equivalent instructions. +// The corresponding s390x instructions appear in the +// comments. +// Implementation for big endian would have to be +// investigated, I think it would be different. 
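+// For reference, per 32-bit lane the "multiply high"/"multiply low"
+// macros below compute what this scalar Go sketch does (hypothetical
+// helper, not part of this file):
+//
+//	func mulHiLo(a, b uint32) (hi, lo uint32) {
+//		p := uint64(a) * uint64(b)
+//		return uint32(p >> 32), uint32(p)
+//	}
+//
+// VMULEUW/VMULOUW produce the full 64-bit products of the even/odd
+// lanes, and VPERM with the EXTRACT_HI/EXTRACT_LO permute strings
+// gathers the high or low 32-bit halves back into lane order.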
+// +// Vector multiply low word +// +// VMLF x0, x1, out_low +#define VMULT_LOW(x1, x2, out_low) \ + VMULUWM x1, x2, out_low + + // + // Vector multiply high word + // + // VMLHF x0, x1, out_hi +#define VMULT_HI(x1, x2, out_hi) \ + VMULEUW x1, x2, TMP1; \ + VMULOUW x1, x2, TMP2; \ + VPERM TMP1, TMP2, EXTRACT_HI, out_hi + +// +// Vector multiply word +// +// VMLF x0, x1, out_low +// VMLHF x0, x1, out_hi +#define VMULT(x1, x2, out_low, out_hi) \ + VMULEUW x1, x2, TMP1; \ + VMULOUW x1, x2, TMP2; \ + VPERM TMP1, TMP2, EXTRACT_LO, out_low; \ + VPERM TMP1, TMP2, EXTRACT_HI, out_hi + +// +// Vector multiply add word +// +// VMALF x0, x1, y, out_low +// VMALHF x0, x1, y, out_hi +#define VMULT_ADD(x1, x2, y, out_low, out_hi) \ + VSPLTISW $1, TMP1; \ + VMULEUW y, TMP1, TMP2; \ + VMULOUW y, TMP1, TMP1; \ + VMULEUW x1, x2, out_low; \ + VMULOUW x1, x2, out_hi; \ + VADDUDM TMP1, out_hi, TMP1; \ + VADDUDM TMP2, out_low, TMP2; \ + VPERM TMP2, TMP1, EXTRACT_LO, out_low; \ + VPERM TMP2, TMP1, EXTRACT_HI, out_hi + +// +// Vector multiply add high word +// +// VMALF x0, x1, y, out_low +// VMALHF x0, x1, y, out_hi +#define VMULT_ADD_HI(x1, x2, y, out_low, out_hi) \ + VSPLTISW $1, TMP1; \ + VMULOUW y, TMP1, TMP2; \ + VMULEUW y, TMP1, TMP1; \ + VMULEUW x1, x2, out_hi; \ + VMULOUW x1, x2, out_low; \ + VADDUDM TMP1, out_hi, TMP1; \ + VADDUDM TMP2, out_low, TMP2; \ + VPERM TMP2, TMP1, EXTRACT_HI, out_hi + +// +// Vector multiply add low word +// +// VMALF s0, x1, y, out_low +#define VMULT_ADD_LOW(x1, x2, y, out_low) \ + VMULUWM x1, x2, out_low; \ + VADDUWM out_low, y, out_low + +#define res_ptr R3 +#define a_ptr R4 + +// func p256ReverseBytes(res, in []byte) +// Reuse of target and destination OK +TEXT ·p256ReverseBytes(SB), NOSPLIT, $0-48 + MOVD res+0(FP), res_ptr + MOVD in+24(FP), a_ptr + + MOVD $8, R5 + MOVD $16, R6 + MOVD $24, R7 + + MOVDBR (R0+a_ptr), R8 + MOVDBR (R5+a_ptr), R9 + MOVDBR (R6+a_ptr), R10 + MOVDBR (R7+a_ptr), R11 + + MOVD R11, (R0+res_ptr) + MOVD R10, (R5+res_ptr) + MOVD R9, (R6+res_ptr) + MOVD R8, (R7+res_ptr) + RET + +#undef res_ptr +#undef a_ptr + +// func p256NegCond(val *p256Point, cond int) +#define P1ptr R3 +#define CPOOL R7 + +#define Y1L V0 +#define Y1L_ VS32 +#define Y1H V1 +#define Y1H_ VS33 +#define T1L V2 +#define T1L_ VS34 +#define T1H V3 +#define T1H_ VS35 + +#define SWAP V28 +#define SWAP_ VS60 + +#define PL V30 +#define PL_ VS62 +#define PH V31 +#define PH_ VS63 + +#define SEL1 V5 +#define SEL1_ VS37 +#define CAR1 V6 +// +// iff cond == 1 val <- -val +// +TEXT ·p256NegCond(SB), NOSPLIT, $0-16 + MOVD val+0(FP), P1ptr + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $40, R19 + + MOVD cond+8(FP), R6 + CMP $0, R6 + BC 12, 2, LR // just return if cond == 0 + + MOVD $p256mul<>+0x00(SB), CPOOL + + MOVD $byteswap<>+0x00(SB), R8 + LXVD2X (R8)(R0), SWAP_ + + LXVD2X (P1ptr)(R17), Y1L_ + LXVD2X (P1ptr)(R18), Y1H_ + + VPERM Y1H, Y1H, SWAP, Y1H + VPERM Y1L, Y1L, SWAP, Y1L + + LXVD2X (CPOOL)(R0), PL_ + LXVD2X (CPOOL)(R16), PH_ + + VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry + VSUBUQM PL, Y1L, T1L // subtract part2 giving result + VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2 + + VPERM T1H, T1H, SWAP, T1H + VPERM T1L, T1L, SWAP, T1L + + STXVD2X T1L_, (R17+P1ptr) + STXVD2X T1H_, (R18+P1ptr) + RET + +#undef P1ptr +#undef CPOOL +#undef Y1L +#undef Y1L_ +#undef Y1H +#undef Y1H_ +#undef T1L +#undef T1L_ +#undef T1H +#undef T1H_ +#undef PL +#undef PL_ +#undef PH +#undef PH_ +#undef SEL1 +#undef SEL1_ +#undef CAR1 + +// +// if cond == 0 res <-b else res <-a +// 
+// func p256MovCond(res, a, b *p256Point, cond int) +#define P3ptr R3 +#define P1ptr R4 +#define P2ptr R5 + +#define FROMptr R7 +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X1L_ VS32 +#define X1H_ VS33 +#define Y1L_ VS34 +#define Y1H_ VS35 +#define Z1L_ VS36 +#define Z1H_ VS37 + +// This function uses LXVD2X and STXVD2X to avoid the +// data alignment requirement for LVX, STVX. Since +// this code is just moving bytes and not doing arithmetic, +// order of the bytes doesn't matter. +// +TEXT ·p256MovCond(SB), NOSPLIT, $0-32 + MOVD res+0(FP), P3ptr + MOVD a+8(FP), P1ptr + MOVD b+16(FP), P2ptr + MOVD cond+24(FP), R6 + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $56, R21 + MOVD $64, R19 + MOVD $80, R20 + + // Check the condition + CMP $0, R6 + + // If 0, use b as the source + BEQ FROMB + + // Not 0, use a as the source + MOVD P1ptr, FROMptr + BR LOADVALS + +FROMB: + MOVD P2ptr, FROMptr + +LOADVALS: + // Load from a or b depending on the setting + // of FROMptr + LXVW4X (FROMptr+R0), X1H_ + LXVW4X (FROMptr+R16), X1L_ + LXVW4X (FROMptr+R17), Y1H_ + LXVW4X (FROMptr+R18), Y1L_ + LXVW4X (FROMptr+R19), Z1H_ + LXVW4X (FROMptr+R20), Z1L_ + + STXVW4X X1H_, (P3ptr+R0) + STXVW4X X1L_, (P3ptr+R16) + STXVW4X Y1H_, (P3ptr+R17) + STXVW4X Y1L_, (P3ptr+R18) + STXVW4X Z1H_, (P3ptr+R19) + STXVW4X Z1L_, (P3ptr+R20) + + RET + +#undef P3ptr +#undef P1ptr +#undef P2ptr +#undef FROMptr +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X1L_ +#undef X1H_ +#undef Y1L_ +#undef Y1H_ +#undef Z1L_ +#undef Z1H_ +// +// Select the point from the table for idx +// +// func p256Select(point *p256Point, table []p256Point, idx int) +#define P3ptr R3 +#define P1ptr R4 +#define COUNT R5 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X1L_ VS32 +#define X1H_ VS33 +#define Y1L_ VS34 +#define Y1H_ VS35 +#define Z1L_ VS36 +#define Z1H_ VS37 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 +#define X2L_ VS38 +#define X2H_ VS39 +#define Y2L_ VS40 +#define Y2H_ VS41 +#define Z2L_ VS42 +#define Z2H_ VS43 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL1_ VS52 +#define SEL2 V21 +// +TEXT ·p256Select(SB), NOSPLIT, $0-40 + MOVD point+0(FP), P3ptr + MOVD table+8(FP), P1ptr + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + + LXVDSX (R1)(R19), SEL1_ // VLREPG idx+32(FP), SEL1 + VSPLTB $7, SEL1, IDX // splat byte + VSPLTISB $1, ONE // VREPIB $1, ONE + VSPLTISB $1, SEL2 // VREPIB $1, SEL2 + MOVD $17, COUNT + MOVD COUNT, CTR // set up ctr + + VSPLTISB $0, X1H // VZERO X1H + VSPLTISB $0, X1L // VZERO X1L + VSPLTISB $0, Y1H // VZERO Y1H + VSPLTISB $0, Y1L // VZERO Y1L + VSPLTISB $0, Z1H // VZERO Z1H + VSPLTISB $0, Z1L // VZERO Z1L + +loop_select: + + // LVXD2X is used here since data alignment doesn't + // matter. + + LXVD2X (P1ptr+R0), X2H_ + LXVD2X (P1ptr+R16), X2L_ + LXVD2X (P1ptr+R17), Y2H_ + LXVD2X (P1ptr+R18), Y2L_ + LXVD2X (P1ptr+R19), Z2H_ + LXVD2X (P1ptr+R20), Z2L_ + + VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK + + // This will result in SEL1 being all 0s or 1s, meaning + // the result is either X1L or X2L, no individual byte + // selection. 
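+	// In Go terms each pass does roughly (sketch only):
+	//	if counter == idx { X1, Y1, Z1 = X2, Y2, Z2 }
+	// but without a data-dependent branch: VCMPEQUD sets SEL1 to
+	// all ones only for the matching table entry, and VSEL then
+	// copies that entry into the accumulator.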
+ + VSEL X1L, X2L, SEL1, X1L + VSEL X1H, X2H, SEL1, X1H + VSEL Y1L, Y2L, SEL1, Y1L + VSEL Y1H, Y2H, SEL1, Y1H + VSEL Z1L, Z2L, SEL1, Z1L + VSEL Z1H, Z2H, SEL1, Z1H + + // Add 1 to all bytes in SEL2 + VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK + ADD $96, P1ptr + BC 16, 0, loop_select + + // STXVD2X is used here so that alignment doesn't + // need to be verified. Since values were loaded + // using LXVD2X this is OK. + STXVD2X X1H_, (P3ptr+R0) + STXVD2X X1L_, (P3ptr+R16) + STXVD2X Y1H_, (P3ptr+R17) + STXVD2X Y1L_, (P3ptr+R18) + STXVD2X Z1H_, (P3ptr+R19) + STXVD2X Z1L_, (P3ptr+R20) + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef X2L_ +#undef X2H_ +#undef Y2L_ +#undef Y2H_ +#undef Z2L_ +#undef Z2H_ +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL1_ +#undef SEL2 + +// func p256SelectBase(point, table []uint64, idx int) +#define P3ptr R3 +#define P1ptr R4 +#define COUNT R5 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 +#define X2L_ VS38 +#define X2H_ VS39 +#define Y2L_ VS40 +#define Y2H_ VS41 +#define Z2L_ VS42 +#define Z2H_ VS43 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL1_ VS52 +#define SEL2 V21 +TEXT ·p256SelectBase(SB), NOSPLIT, $0-40 + MOVD point+0(FP), P3ptr + MOVD table+8(FP), P1ptr + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + MOVD $56, R21 + + LXVDSX (R1)(R19), SEL1_ + VSPLTB $7, SEL1, IDX // splat byte + + VSPLTISB $1, ONE // Vector with byte 1s + VSPLTISB $1, SEL2 // Vector with byte 1s + MOVD $65, COUNT + MOVD COUNT, CTR // loop count + + VSPLTISB $0, X1H // VZERO X1H + VSPLTISB $0, X1L // VZERO X1L + VSPLTISB $0, Y1H // VZERO Y1H + VSPLTISB $0, Y1L // VZERO Y1L + VSPLTISB $0, Z1H // VZERO Z1H + VSPLTISB $0, Z1L // VZERO Z1L + +loop_select: + LXVD2X (P1ptr+R0), X2H_ + LXVD2X (P1ptr+R16), X2L_ + LXVD2X (P1ptr+R17), Y2H_ + LXVD2X (P1ptr+R18), Y2L_ + LXVD2X (P1ptr+R19), Z2H_ + LXVD2X (P1ptr+R20), Z2L_ + + VCMPEQUD SEL2, IDX, SEL1 // Compare against idx + + VSEL X1L, X2L, SEL1, X1L // Select if idx matched + VSEL X1H, X2H, SEL1, X1H + VSEL Y1L, Y2L, SEL1, Y1L + VSEL Y1H, Y2H, SEL1, Y1H + VSEL Z1L, Z2L, SEL1, Z1L + VSEL Z1H, Z2H, SEL1, Z1H + + VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1 + ADD $96, P1ptr // Next chunk + BC 16, 0, loop_select + + STXVD2X X1H_, (P3ptr+R0) + STXVD2X X1L_, (P3ptr+R16) + STXVD2X Y1H_, (P3ptr+R17) + STXVD2X Y1L_, (P3ptr+R18) + STXVD2X Z1H_, (P3ptr+R19) + STXVD2X Z1L_, (P3ptr+R20) + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef X1L_ +#undef X1H_ +#undef X2L_ +#undef X2H_ +#undef Y1L_ +#undef Y1H_ +#undef Y2L_ +#undef Y2H_ +#undef Z1L_ +#undef Z1H_ +#undef Z2L_ +#undef Z2H_ +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL1_ +#undef SEL2 +#undef SWAP +#undef SWAP_ + +// --------------------------------------- +// func p256FromMont(res, in []byte) +#define res_ptr R3 +#define x_ptr R4 +#define CPOOL R7 + +#define T0 V0 +#define T0_ VS32 +#define T1 V1 +#define T1_ VS33 +#define T2 V2 +#define TT0 V3 +#define TT1 V4 +#define TT0_ VS35 +#define TT1_ VS36 + +#define ZER V6 +#define SEL1 V7 +#define SEL1_ VS39 +#define SEL2 V8 +#define SEL2_ VS40 
+#define CAR1 V9 +#define CAR2 V10 +#define RED1 V11 +#define RED2 V12 +#define PL V13 +#define PL_ VS45 +#define PH V14 +#define PH_ VS46 +#define SWAP V28 +#define SWAP_ VS57 + +TEXT ·p256FromMont(SB), NOSPLIT, $0-48 + MOVD res+0(FP), res_ptr + MOVD in+24(FP), x_ptr + + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $p256<>+0x00(SB), CPOOL + MOVD $byteswap<>+0x00(SB), R15 + + VSPLTISB $0, T2 // VZERO T2 + VSPLTISB $0, ZER // VZERO ZER + + // Constants are defined so that the LXVD2X is correct + LXVD2X (CPOOL+R0), PH_ + LXVD2X (CPOOL+R16), PL_ + + // VPERM byte selections + LXVD2X (CPOOL+R18), SEL2_ + LXVD2X (CPOOL+R19), SEL1_ + + LXVD2X (R15)(R0), SWAP_ + + LXVD2X (R16)(x_ptr), T1_ + LXVD2X (R0)(x_ptr), T0_ + + // Put in true little endian order + VPERM T0, T0, SWAP, T0 + VPERM T1, T1, SWAP, T1 + + // First round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0 + VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1 + + VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1 + VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0 + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2 + VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1 + VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2 + + // Second round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0 + VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1 + + VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1 + VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0 + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2 + VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1 + VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2 + + // Third round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0 + VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1 + + VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1 + VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0 + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2 + VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1 + VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2 + + // Last round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0 + VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1 + + VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1 + VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0 + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2 + VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1 + VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2 + + // --------------------------------------------------- + + VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1 + VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0 + VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2 + VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1 + VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2 + + VSEL TT0, T0, T2, T0 + VSEL TT1, T1, T2, T1 + + // Reorder the bytes so STXVD2X can be used. + // TT0, TT1 used for VPERM result in case + // the caller expects T0, T1 to be good. 
+ VPERM T0, T0, SWAP, TT0 + VPERM T1, T1, SWAP, TT1 + + STXVD2X TT0_, (R0)(res_ptr) + STXVD2X TT1_, (R16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef CPOOL +#undef T0 +#undef T0_ +#undef T1 +#undef T1_ +#undef T2 +#undef TT0 +#undef TT1 +#undef ZER +#undef SEL1 +#undef SEL1_ +#undef SEL2 +#undef SEL2_ +#undef CAR1 +#undef CAR2 +#undef RED1 +#undef RED2 +#undef PL +#undef PL_ +#undef PH +#undef PH_ +#undef SWAP +#undef SWAP_ + +// --------------------------------------- +// p256MulInternal +// V0-V3 V30,V31 - Not Modified +// V4-V15 V27-V29 - Volatile + +#define CPOOL R7 + +// Parameters +#define X0 V0 // Not modified +#define X1 V1 // Not modified +#define Y0 V2 // Not modified +#define Y1 V3 // Not modified +#define T0 V4 // Result +#define T1 V5 // Result +#define P0 V30 // Not modified +#define P1 V31 // Not modified + +// Temporaries: lots of reused vector regs +#define YDIG V6 // Overloaded with CAR2 +#define ADD1H V7 // Overloaded with ADD3H +#define ADD2H V8 // Overloaded with ADD4H +#define ADD3 V9 // Overloaded with SEL2,SEL5 +#define ADD4 V10 // Overloaded with SEL3,SEL6 +#define RED1 V11 // Overloaded with CAR2 +#define RED2 V12 +#define RED3 V13 // Overloaded with SEL1 +#define T2 V14 +// Overloaded temporaries +#define ADD1 V4 // Overloaded with T0 +#define ADD2 V5 // Overloaded with T1 +#define ADD3H V7 // Overloaded with ADD1H +#define ADD4H V8 // Overloaded with ADD2H +#define ZER V28 // Overloaded with TMP1 +#define CAR1 V6 // Overloaded with YDIG +#define CAR2 V11 // Overloaded with RED1 +// Constant Selects +#define SEL1 V13 // Overloaded with RED3 +#define SEL2 V9 // Overloaded with ADD3,SEL5 +#define SEL3 V10 // Overloaded with ADD4,SEL6 +#define SEL4 V6 // Overloaded with YDIG,CAR1 +#define SEL5 V9 // Overloaded with ADD3,SEL2 +#define SEL6 V10 // Overloaded with ADD4,SEL3 +#define SEL1_ VS45 +#define SEL2_ VS41 +#define SEL3_ VS42 +#define SEL4_ VS38 +#define SEL5_ VS41 +#define SEL6_ VS42 + +// TMP1, TMP2, EXTRACT_LO, EXTRACT_HI used in +// VMULT macros +#define TMP1 V13 // Overloaded with RED3 +#define TMP2 V27 +#define EVENODD R5 +#define EXTRACT_LO V28 +#define EXTRACT_LO_ VS60 +#define EXTRACT_HI V29 +#define EXTRACT_HI_ VS61 + +/* * + * To follow the flow of bits, for your own sanity a stiff drink, need you shall. + * Of a single round, a 'helpful' picture, here is. Meaning, column position has. + * With you, SIMD be... 
+ * + * +--------+--------+ + * +--------| RED2 | RED1 | + * | +--------+--------+ + * | ---+--------+--------+ + * | +---- T2| T1 | T0 |--+ + * | | ---+--------+--------+ | + * | | | + * | | ======================= | + * | | | + * | | +--------+--------+<-+ + * | +-------| ADD2 | ADD1 |--|-----+ + * | | +--------+--------+ | | + * | | +--------+--------+<---+ | + * | | | ADD2H | ADD1H |--+ | + * | | +--------+--------+ | | + * | | +--------+--------+<-+ | + * | | | ADD4 | ADD3 |--|-+ | + * | | +--------+--------+ | | | + * | | +--------+--------+<---+ | | + * | | | ADD4H | ADD3H |------|-+ |(+vzero) + * | | +--------+--------+ | | V + * | | ------------------------ | | +--------+ + * | | | | | RED3 | [d0 0 0 d0] + * | | | | +--------+ + * | +---->+--------+--------+ | | | + * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | + * | +--------+--------+ | | | + * +---->---+--------+--------+ | | | + * T2| T1 | T0 |----+ | | + * ---+--------+--------+ | | | + * ---+--------+--------+<---+ | | + * +--- T2| T1 | T0 |----------+ + * | ---+--------+--------+ | | + * | +--------+--------+<-------------+ + * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] + * | +--------+--------+ | | | + * | +--------+<----------------------+ + * | | RED3 |--------------+ | [0 0 d1 d0] + * | +--------+ | | + * +--->+--------+--------+ | | + * | T1 | T0 |--------+ + * +--------+--------+ | | + * --------------------------- | | + * | | + * +--------+--------+<----+ | + * | RED2 | RED1 | | + * +--------+--------+ | + * ---+--------+--------+<-------+ + * T2| T1 | T0 | (H1P-H1P-H00RRAY!) + * ---+--------+--------+ + * + * *Mi obra de arte de siglo XXI @vpaprots + * + * + * First group is special, doesnt get the two inputs: + * +--------+--------+<-+ + * +-------| ADD2 | ADD1 |--|-----+ + * | +--------+--------+ | | + * | +--------+--------+<---+ | + * | | ADD2H | ADD1H |--+ | + * | +--------+--------+ | | + * | +--------+--------+<-+ | + * | | ADD4 | ADD3 |--|-+ | + * | +--------+--------+ | | | + * | +--------+--------+<---+ | | + * | | ADD4H | ADD3H |------|-+ |(+vzero) + * | +--------+--------+ | | V + * | ------------------------ | | +--------+ + * | | | | RED3 | [d0 0 0 d0] + * | | | +--------+ + * +---->+--------+--------+ | | | + * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | + * +--------+--------+ | | | + * ---+--------+--------+<---+ | | + * +--- T2| T1 | T0 |----------+ + * | ---+--------+--------+ | | + * | +--------+--------+<-------------+ + * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] + * | +--------+--------+ | | | + * | +--------+<----------------------+ + * | | RED3 |--------------+ | [0 0 d1 d0] + * | +--------+ | | + * +--->+--------+--------+ | | + * | T1 | T0 |--------+ + * +--------+--------+ | | + * --------------------------- | | + * | | + * +--------+--------+<----+ | + * | RED2 | RED1 | | + * +--------+--------+ | + * ---+--------+--------+<-------+ + * T2| T1 | T0 | (H1P-H1P-H00RRAY!) + * ---+--------+--------+ + * + * Last 'group' needs to RED2||RED1 shifted less + */ +TEXT p256MulInternal<>(SB), NOSPLIT, $0-16 + // CPOOL loaded from caller + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + MOVD $96, R21 + MOVD $112, R22 + + MOVD $p256permhilo<>+0x00(SB), EVENODD + + // These values are used by the VMULTxxx macros to + // extract the high and low portions of the intermediate + // result. 
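+	// The multiply below consumes Y one 32-bit digit at a time
+	// (VSPLTW), accumulating X*digit into T2||T1||T0 and interleaving
+	// the reduction. Because p = -1 mod 2^32, the Montgomery factor
+	// is the low word itself, so the required multiples of P are
+	// assembled with the RED1/RED2/RED3 VPERM selects rather than
+	// with extra multiplies.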
+ LXVD2X (R0)(EVENODD), EXTRACT_LO_ + LXVD2X (R16)(EVENODD), EXTRACT_HI_ + + // --------------------------------------------------- + + VSPLTW $3, Y0, YDIG // VREPF Y0 is input + + // VMLHF X0, YDIG, ADD1H + // VMLHF X1, YDIG, ADD2H + // VMLF X0, YDIG, ADD1 + // VMLF X1, YDIG, ADD2 + // + VMULT(X0, YDIG, ADD1, ADD1H) + VMULT(X1, YDIG, ADD2, ADD2H) + + VSPLTW $2, Y0, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free + // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free + VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H) + + LXVD2X (R17)(CPOOL), SEL1_ + VSPLTISB $0, ZER // VZERO ZER + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB + VSLDOI $12, ZER, ADD2, T1 // ADD2 Free // VSLDB + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // ADD3 Free // VAQ + VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ + + LXVD2X (R18)(CPOOL), SEL2_ + LXVD2X (R19)(CPOOL), SEL3_ + LXVD2X (R20)(CPOOL), SEL4_ + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] + VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] + VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow -->? // VSQ + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + // --------------------------------------------------- + + VSPLTW $1, Y0, YDIG // VREPF + LXVD2X (R0)(EVENODD), EXTRACT_LO_ + LXVD2X (R16)(EVENODD), EXTRACT_HI_ + + // VMALHF X0, YDIG, T0, ADD1H + // VMALHF X1, YDIG, T1, ADD2H + // VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 + // VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 + VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H) + VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H) + + VSPLTW $0, Y0, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H + // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER + VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H) + + VSPLTISB $0, ZER // VZERO ZER + LXVD2X (R17)(CPOOL), SEL1_ + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB + VSLDOI $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free // VSLDB + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, T2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // VAQ + VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + + LXVD2X (R18)(CPOOL), SEL2_ + LXVD2X (R19)(CPOOL), SEL3_ + LXVD2X (R20)(CPOOL), SEL4_ + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] + VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] + VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + // --------------------------------------------------- + + VSPLTW $3, Y1, YDIG // VREPF + LXVD2X (R0)(EVENODD), EXTRACT_LO_ + LXVD2X (R16)(EVENODD), EXTRACT_HI_ + + // VMALHF X0, YDIG, T0, ADD1H + // VMALHF X1, YDIG, T1, ADD2H + // VMALF X0, YDIG, T0, 
ADD1 + // VMALF X1, YDIG, T1, ADD2 + VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H) + VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H) + + VSPLTW $2, Y1, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free + // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free + VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H) + + LXVD2X (R17)(CPOOL), SEL1_ + VSPLTISB $0, ZER // VZERO ZER + LXVD2X (R17)(CPOOL), SEL1_ + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB + VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, T2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // VAQ + VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + + LXVD2X (R18)(CPOOL), SEL2_ + LXVD2X (R19)(CPOOL), SEL3_ + LXVD2X (R20)(CPOOL), SEL4_ + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] + VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] + VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + // --------------------------------------------------- + + VSPLTW $1, Y1, YDIG // VREPF + LXVD2X (R0)(EVENODD), EXTRACT_LO_ + LXVD2X (R16)(EVENODD), EXTRACT_HI_ + + // VMALHF X0, YDIG, T0, ADD1H + // VMALHF X1, YDIG, T1, ADD2H + // VMALF X0, YDIG, T0, ADD1 + // VMALF X1, YDIG, T1, ADD2 + VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H) + VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H) + + VSPLTW $0, Y1, YDIG // VREPF + + // VMALF X0, YDIG, ADD1H, ADD3 + // VMALF X1, YDIG, ADD2H, ADD4 + // VMALHF X0, YDIG, ADD1H, ADD3H + // VMALHF X1, YDIG, ADD2H, ADD4H + VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H) + VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H) + + VSPLTISB $0, ZER // VZERO ZER + LXVD2X (R17)(CPOOL), SEL1_ + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDOI $12, ADD2, ADD1, T0 // VSLDB + VSLDOI $12, T2, ADD2, T1 // VSLDB + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, T2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + + VADDCUQ T0, ADD3, CAR1 // VACCQ + VADDUQM T0, ADD3, T0 // VAQ + VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ + VADDEUQM T1, ADD4, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + + LXVD2X (R21)(CPOOL), SEL5_ + LXVD2X (R22)(CPOOL), SEL6_ + VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] + VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] + VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ + + VSLDOI $12, T1, T0, T0 // VSLDB + VSLDOI $12, T2, T1, T1 // VSLDB + + VADDCUQ T0, ADD3H, CAR1 // VACCQ + VADDUQM T0, ADD3H, T0 // VAQ + VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ + VADDEUQM T1, ADD4H, CAR1, T1 // VACQ + + VADDCUQ T0, RED1, CAR1 // VACCQ + VADDUQM T0, RED1, T0 // VAQ + VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ + VADDEUQM T1, RED2, CAR1, T1 // VACQ + VADDUQM T2, CAR2, T2 // VAQ + + // --------------------------------------------------- + + VSPLTISB $0, RED3 // VZERO RED3 + VSUBCUQ T0, P0, CAR1 // VSCBIQ + VSUBUQM T0, P0, ADD1H // VSQ + VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ + VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ + VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ + + // what output to use, ADD2H||ADD1H or T1||T0? 
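+	// This is the constant-time form of: if T >= P { T = T - P }.
+	// After the borrow has propagated, T2 is all ones exactly when
+	// the subtraction underflowed, so VSEL keeps the original T in
+	// that case and takes the reduced T-P otherwise.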
+ VSEL ADD1H, T0, T2, T0 + VSEL ADD2H, T1, T2, T1 + RET + +#undef CPOOL + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 + +#undef SEL1 +#undef SEL2 +#undef SEL3 +#undef SEL4 +#undef SEL5 +#undef SEL6 +#undef SEL1_ +#undef SEL2_ +#undef SEL3_ +#undef SEL4_ +#undef SEL5_ +#undef SEL6_ + +#undef YDIG +#undef ADD1H +#undef ADD2H +#undef ADD3 +#undef ADD4 +#undef RED1 +#undef RED2 +#undef RED3 +#undef T2 +#undef ADD1 +#undef ADD2 +#undef ADD3H +#undef ADD4H +#undef ZER +#undef CAR1 +#undef CAR2 + +#undef TMP1 +#undef TMP2 +#undef EVENODD +#undef EXTRACT_HI +#undef EXTRACT_HI_ +#undef EXTRACT_LO +#undef EXTRACT_LO_ + +#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ + VSPLTISB $0, ZER \ // VZERO + VSUBCUQ X0, Y0, CAR1 \ + VSUBUQM X0, Y0, T0 \ + VSUBECUQ X1, Y1, CAR1, SEL1 \ + VSUBEUQM X1, Y1, CAR1, T1 \ + VSUBUQM ZER, SEL1, SEL1 \ // VSQ + \ + VADDCUQ T0, PL, CAR1 \ // VACCQ + VADDUQM T0, PL, TT0 \ // VAQ + VADDEUQM T1, PH, CAR1, TT1 \ // VACQ + \ + VSEL TT0, T0, SEL1, T0 \ + VSEL TT1, T1, SEL1, T1 \ + +#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ + VADDCUQ X0, Y0, CAR1 \ + VADDUQM X0, Y0, T0 \ + VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ + VADDEUQM X1, Y1, CAR1, T1 \ + \ + VSPLTISB $0, ZER \ + VSUBCUQ T0, PL, CAR1 \ // VSCBIQ + VSUBUQM T0, PL, TT0 \ + VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ + VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ + VSUBEUQM T2, ZER, CAR2, SEL1 \ + \ + VSEL TT0, T0, SEL1, T0 \ + VSEL TT1, T1, SEL1, T1 + +#define p256HalfInternal(T1, T0, X1, X0) \ + VSPLTISB $0, ZER \ + VSUBEUQM ZER, ZER, X0, SEL1 \ + \ + VADDCUQ X0, PL, CAR1 \ + VADDUQM X0, PL, T0 \ + VADDECUQ X1, PH, CAR1, T2 \ + VADDEUQM X1, PH, CAR1, T1 \ + \ + VSEL T0, X0, SEL1, T0 \ + VSEL T1, X1, SEL1, T1 \ + VSEL T2, ZER, SEL1, T2 \ + \ + VSLDOI $15, T2, ZER, TT1 \ + VSLDOI $15, T1, ZER, TT0 \ + VSPLTISB $1, SEL1 \ + VSR T0, SEL1, T0 \ // VSRL + VSR T1, SEL1, T1 \ + VSPLTISB $7, SEL1 \ // VREPIB + VSL TT0, SEL1, TT0 \ + VSL TT1, SEL1, TT1 \ + VOR T0, TT0, T0 \ + VOR T1, TT1, T1 + +// --------------------------------------- +// func p256MulAsm(res, in1, in2 []byte) +#define res_ptr R3 +#define x_ptr R4 +#define y_ptr R5 +#define CPOOL R7 +#define TEMP R8 + +// Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 +#define X0_ VS32 +#define X1_ VS33 +#define Y0_ VS34 +#define Y1_ VS35 +#define T0_ VS36 +#define T1_ VS37 +#define SWAP V28 +#define SWAP_ VS60 + +// Constants +#define P0 V30 +#define P1 V31 +#define P0_ VS62 +#define P1_ VS63 +// +// Montgomery multiplication modulo P256 +// +TEXT ·p256MulAsm(SB), NOSPLIT, $0-72 + MOVD res+0(FP), res_ptr + MOVD in1+24(FP), x_ptr + MOVD in2+48(FP), y_ptr + MOVD $16, R16 + MOVD $32, R17 + + MOVD $p256mul<>+0x00(SB), CPOOL + MOVD $byteswap<>+0x00(SB), R8 + + LXVD2X (R8)(R0), SWAP_ + + LXVD2X (R0)(x_ptr), X0_ + LXVD2X (R16)(x_ptr), X1_ + + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + + LXVD2X (R0)(y_ptr), Y0_ + LXVD2X (R16)(y_ptr), Y1_ + + VPERM Y0, Y0, SWAP, Y0 + VPERM Y1, Y1, SWAP, Y1 + + LXVD2X (R16)(CPOOL), P1_ + LXVD2X (R0)(CPOOL), P0_ + + CALL p256MulInternal<>(SB) + + MOVD $p256mul<>+0x00(SB), CPOOL + MOVD $byteswap<>+0x00(SB), R8 + + LXVD2X (R8)(R0), SWAP_ + + VPERM T0, T0, SWAP, T0 + VPERM T1, T1, SWAP, T1 + STXVD2X T0_, (R0)(res_ptr) + STXVD2X T1_, (R16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef y_ptr +#undef CPOOL + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 +#undef X0_ +#undef X1_ +#undef Y0_ +#undef Y1_ +#undef T0_ 
+#undef T1_ +#undef P0_ +#undef P1_ + +// Point add with P2 being affine point +// If sign == 1 -> P2 = -P2 +// If sel == 0 -> P3 = P1 +// if zero == 0 -> P3 = P2 +// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) +#define P3ptr R3 +#define P1ptr R4 +#define P2ptr R5 +#define CPOOL R7 + +// Temporaries in REGs +#define Y2L V15 +#define Y2H V16 +#define Y2L_ VS47 +#define Y2H_ VS48 +#define T1L V17 +#define T1H V18 +#define T2L V19 +#define T2H V20 +#define T3L V21 +#define T3H V22 +#define T4L V23 +#define T4H V24 + +// Temps for Sub and Add +#define TT0 V11 +#define TT1 V12 +#define T2 V13 + +// p256MulAsm Parameters +#define X0 V0 +#define X1 V1 +#define X0_ VS32 +#define X1_ VS33 +#define Y0 V2 +#define Y1 V3 +#define Y0_ VS34 +#define Y1_ VS35 +#define T0 V4 +#define T1 V5 + +#define PL V30 +#define PH V31 +#define PL_ VS62 +#define PH_ VS63 + +// Names for zero/sel selects +#define X1L V0 +#define X1H V1 +#define X1L_ VS32 +#define X1H_ VS33 +#define Y1L V2 // p256MulAsmParmY +#define Y1H V3 // p256MulAsmParmY +#define Y1L_ VS34 +#define Y1H_ VS35 +#define Z1L V4 +#define Z1H V5 +#define Z1L_ VS36 +#define Z1H_ VS37 +#define X2L V0 +#define X2H V1 +#define X2L_ VS32 +#define X2H_ VS33 +#define Z2L V4 +#define Z2H V5 +#define Z2L_ VS36 +#define Z2H_ VS37 +#define X3L V17 // T1L +#define X3H V18 // T1H +#define Y3L V21 // T3L +#define Y3H V22 // T3H +#define Z3L V25 +#define Z3H V26 +#define X3L_ VS49 +#define X3H_ VS50 +#define Y3L_ VS53 +#define Y3H_ VS54 +#define Z3L_ VS57 +#define Z3H_ VS58 + +#define ZER V6 +#define SEL1 V7 +#define SEL1_ VS39 +#define CAR1 V8 +#define CAR2 V9 +/* * + * Three operand formula: + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. + * T1 = Z1² + * T2 = T1*Z1 + * T1 = T1*X2 + * T2 = T2*Y2 + * T1 = T1-X1 + * T2 = T2-Y1 + * Z3 = Z1*T1 + * T3 = T1² + * T4 = T3*T1 + * T3 = T3*X1 + * T1 = 2*T3 + * X3 = T2² + * X3 = X3-T1 + * X3 = X3-T4 + * T3 = T3-X3 + * T3 = T3*T2 + * T4 = T4*Y1 + * Y3 = T3-T4 + + * Three operand formulas, but with MulInternal X,Y used to store temps +X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 +X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 +X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 +X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 +SUB(T2+0x00(SB), CPOOL + + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + MOVD $96, R21 + MOVD $112, R22 + MOVD $128, R23 + MOVD $144, R24 + MOVD $160, R25 + MOVD $104, R26 // offset of sign+24(FP) + + MOVD $byteswap<>+0+00(SB), R8 + LXVD2X (R16)(CPOOL), PH_ + LXVD2X (R0)(CPOOL), PL_ + + // if (sign == 1) { + // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2 + // } + + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R17)(P2ptr), Y2L_ + LXVD2X (R18)(P2ptr), Y2H_ + VPERM Y2H, Y2H, SWAP, Y2H + VPERM Y2L, Y2L, SWAP, Y2L + + // Equivalent of VLREPG sign+24(FP), SEL1 + LXVDSX (R1)(R26), SEL1_ + VSPLTISB $0, ZER + VCMPEQUD SEL1, ZER, SEL1 + + VSUBCUQ PL, Y2L, CAR1 + VSUBUQM PL, Y2L, T1L + VSUBEUQM PH, Y2H, CAR1, T1H + + VSEL T1L, Y2L, SEL1, Y2L + VSEL T1H, Y2H, SEL1, Y2H + +/* * + * Three operand formula: + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 
+ */ + // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R19)(P1ptr), X0_ // Z1H + LXVD2X (R20)(P1ptr), X1_ // Z1L + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL p256MulInternal<>(SB) + + // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 + VOR T0, T0, X0 + VOR T1, T1, X1 + CALL p256MulInternal<>(SB) + VOR T0, T0, T2L + VOR T1, T1, T2H + + // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 + MOVD in2+16(FP), P2ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R0)(P2ptr), Y0_ // X2H + LXVD2X (R16)(P2ptr), Y1_ // X2L + VPERM Y0, Y0, SWAP, Y0 + VPERM Y1, Y1, SWAP, Y1 + CALL p256MulInternal<>(SB) + VOR T0, T0, T1L + VOR T1, T1, T1H + + // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 + VOR T2L, T2L, X0 + VOR T2H, T2H, X1 + VOR Y2L, Y2L, Y0 + VOR Y2H, Y2H, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T2(SB) + + // VST T1, 64(P3ptr) + // VST T0, 80(P3ptr) + VOR T0, T0, Z3L + VOR T1, T1, Z3H + + // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 + VOR Y0, Y0, X0 + VOR Y1, Y1, X1 + CALL p256MulInternal<>(SB) + VOR T0, T0, X0 + VOR T1, T1, X1 + + // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 + CALL p256MulInternal<>(SB) + VOR T0, T0, T4L + VOR T1, T1, T4H + + // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 + MOVD in1+8(FP), P1ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R0)(P1ptr), Y0_ // X1H + LXVD2X (R16)(P1ptr), Y1_ // X1L + VPERM Y1, Y1, SWAP, Y1 + VPERM Y0, Y0, SWAP, Y0 + CALL p256MulInternal<>(SB) + VOR T0, T0, T3L + VOR T1, T1, T3H + + // ADD(T1(SB) + + // SUB(T(SB) + VOR T0, T0, T3L + VOR T1, T1, T3H + + // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 + VOR T4L, T4L, X0 + VOR T4H, T4H, X1 + MOVD in1+8(FP), P1ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R17)(P1ptr), Y0_ // Y1H + LXVD2X (R18)(P1ptr), Y1_ // Y1L + VPERM Y0, Y0, SWAP, Y0 + VPERM Y1, Y1, SWAP, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T+0x00(SB), CPOOL + MOVD $byteswap<>+0x00(SB), R15 + + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + + LXVD2X (R16)(CPOOL), PH_ + LXVD2X (R0)(CPOOL), PL_ + + LXVD2X (R15)(R0), SWAP_ + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1² + LXVD2X (R19)(P1ptr), X0_ // Z1H + LXVD2X (R20)(P1ptr), X1_ // Z1L + + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL p256MulInternal<>(SB) + + // SUB(X(SB) + + // ADD(T2(SB) + + LXVD2X (R15)(R0), SWAP_ + + // Leave T0, T1 as is. 
+ VPERM T0, T0, SWAP, TT0 + VPERM T1, T1, SWAP, TT1 + STXVD2X TT0_, (R19)(P3ptr) + STXVD2X TT1_, (R20)(P3ptr) + + // X- ; Y=X ; MUL; T- // Y3 = Y3² + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL p256MulInternal<>(SB) + + // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 + VOR T0, T0, X0 + VOR T1, T1, X1 + LXVD2X (R15)(R0), SWAP_ + LXVD2X (R0)(P1ptr), Y0_ + LXVD2X (R16)(P1ptr), Y1_ + VPERM Y0, Y0, SWAP, Y0 + VPERM Y1, Y1, SWAP, Y1 + CALL p256MulInternal<>(SB) + VOR T0, T0, T3L + VOR T1, T1, T3H + + // X- ; Y=X ; MUL; T- // Y3 = Y3² + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL p256MulInternal<>(SB) + + // HAL(Y3(SB) + + // ADD(T1(SB) + + // SUB(Y3+0x00(SB), CPOOL + MOVD $16, R16 + MOVD $32, R17 + MOVD $48, R18 + MOVD $64, R19 + MOVD $80, R20 + + MOVD $byteswap<>+0x00(SB), R8 + LXVD2X (R16)(CPOOL), PH_ + LXVD2X (R0)(CPOOL), PL_ + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R19)(P1ptr), X0_ // Z1L + LXVD2X (R20)(P1ptr), X1_ // Z1H + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; R=T // R = Z1*T1 + VOR T0, T0, Y0 + VOR T1, T1, Y1 + CALL p256MulInternal<>(SB) + VOR T0, T0, RL // SAVE: RL + VOR T1, T1, RH // SAVE: RH + + STXVD2X RH_, (R1)(R17) // V27 has to be saved + + // X=X2; Y- ; MUL; H=T // H = X2*T1 + MOVD in2+16(FP), P2ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R0)(P2ptr), X0_ // X2L + LXVD2X (R16)(P2ptr), X1_ // X2H + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + CALL p256MulInternal<>(SB) + VOR T0, T0, HL // SAVE: HL + VOR T1, T1, HH // SAVE: HH + + // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 + MOVD in2+16(FP), P2ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R19)(P2ptr), X0_ // Z2L + LXVD2X (R20)(P2ptr), X1_ // Z2H + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + VOR X0, X0, Y0 + VOR X1, X1, Y1 + CALL p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 + VOR T0, T0, Y0 + VOR T1, T1, Y1 + CALL p256MulInternal<>(SB) + VOR T0, T0, S1L // SAVE: S1L + VOR T1, T1, S1H // SAVE: S1H + + // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 + MOVD in1+8(FP), P1ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R0)(P1ptr), X0_ // X1L + LXVD2X (R16)(P1ptr), X1_ // X1H + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + CALL p256MulInternal<>(SB) + VOR T0, T0, U1L // SAVE: U1L + VOR T1, T1, U1H // SAVE: U1H + + // SUB(H+0x00(SB), R8 + MOVD in1+8(FP), P1ptr + MOVD in2+16(FP), P2ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R19)(P1ptr), X0_ // Z1L + LXVD2X (R20)(P1ptr), X1_ // Z1H + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + LXVD2X (R19)(P2ptr), Y0_ // Z2L + LXVD2X (R20)(P2ptr), Y1_ // Z2H + VPERM Y0, Y0, SWAP, Y0 + VPERM Y1, Y1, SWAP, Y1 + CALL p256MulInternal<>(SB) + + // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H + VOR T0, T0, X0 + VOR T1, T1, X1 + VOR HL, HL, Y0 + VOR HH, HH, Y1 + CALL p256MulInternal<>(SB) + MOVD res+0(FP), P3ptr + LXVD2X (R8)(R0), SWAP_ + VPERM T1, T1, SWAP, TT1 + VPERM T0, T0, SWAP, TT0 + STXVD2X TT0_, (R19)(P3ptr) + STXVD2X TT1_, (R20)(P3ptr) + + // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 + MOVD in1+8(FP), P1ptr + LXVD2X (R17)(P1ptr), X0_ + LXVD2X (R18)(P1ptr), X1_ + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + VOR S1L, S1L, Y0 + VOR S1H, S1H, Y1 + CALL p256MulInternal<>(SB) + VOR T0, T0, S1L + VOR T1, T1, S1H + + // X=Y2; Y=R ; MUL; T- // R = Y2*R + MOVD in2+16(FP), P2ptr + LXVD2X (R8)(R0), SWAP_ + LXVD2X (R17)(P2ptr), X0_ + LXVD2X (R18)(P2ptr), X1_ + VPERM X0, X0, SWAP, X0 + VPERM X1, X1, SWAP, X1 + VOR RL, RL, Y0 + + // VOR RH, RH, Y1 RH was saved above in D2X format + LXVD2X (R1)(R17), Y1_ + CALL 
p256MulInternal<>(SB) + + // SUB(R(SB) + + // X- ; Y=T ; MUL; T2=T // T2 = H*T1 + VOR T0, T0, Y0 + VOR T1, T1, Y1 + CALL p256MulInternal<>(SB) + VOR T0, T0, T2L + VOR T1, T1, T2H + + // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 + VOR U1L, U1L, X0 + VOR U1H, U1H, X1 + CALL p256MulInternal<>(SB) + VOR T0, T0, U1L + VOR T1, T1, U1H + + // X=R ; Y=R ; MUL; T- // X3 = R*R + VOR RL, RL, X0 + + // VOR RH, RH, X1 + VOR RL, RL, Y0 + + // RH was saved above using STXVD2X + LXVD2X (R1)(R17), X1_ + VOR X1, X1, Y1 + + // VOR RH, RH, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T(SB) + VOR T0, T0, U1L + VOR T1, T1, U1H + + // X=S1; Y=T2; MUL; T- // T2 = S1*T2 + VOR S1L, S1L, X0 + VOR S1H, S1H, X1 + VOR T2L, T2L, Y0 + VOR T2H, T2H, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T P2 = -P2 +// If sel == 0 -> P3 = P1 +// if zero == 0 -> P3 = P2 +// +//go:noescape +func p256PointAddAffineAsm(res, in1, in2 *p256Point, sign, sel, zero int) + +// Point add +// +//go:noescape +func p256PointAddAsm(res, in1, in2 *p256Point) int + +// +//go:noescape +func p256PointDoubleAsm(res, in *p256Point) + +// The result should be a slice in LE order, but the slice +// from big.Bytes is in BE order. +// TODO: For big endian implementation, do not reverse bytes. +// +func fromBig(big *big.Int) []byte { + // This could be done a lot more efficiently... + res := big.Bytes() + t := make([]byte, 32) + if len(res) < 32 { + copy(t[32-len(res):], res) + } else if len(res) == 32 { + copy(t, res) + } else { + copy(t, res[len(res)-32:]) + } + p256ReverseBytes(t, t) + return t +} + +// p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar +// is equal or greater than the order of the group, it's reduced modulo that order. +func p256GetMultiplier(in []byte) []byte { + n := new(big.Int).SetBytes(in) + + if n.Cmp(p256Params.N) >= 0 { + n.Mod(n, p256Params.N) + } + return fromBig(n) +} + +// p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the +// underlying field of the curve. (See initP256 for the value.) Thus rr here is +// R×R mod p. See comment in Inverse about how this is used. +// TODO: For big endian implementation, the bytes in these slices should be in reverse order, +// as found in the s390x implementation. +var rr = []byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0, 0xff, 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00} + +// (This is one, in the Montgomery domain.) 
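+// It equals R mod p = 2^256 mod p; multiplying a Montgomery-domain
+// value by it with p256MulAsm (x*R * R * R^-1 = x*R) leaves the value
+// unchanged.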
+var one = []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00} + +func maybeReduceModP(in *big.Int) *big.Int { + if in.Cmp(p256Params.P) < 0 { + return in + } + return new(big.Int).Mod(in, p256Params.P) +} + +func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) { + var r1, r2 p256Point + + scalarReduced := p256GetMultiplier(baseScalar) + r1IsInfinity := scalarIsZero(scalarReduced) + r1.p256BaseMult(scalarReduced) + + copy(r2.x[:], fromBig(maybeReduceModP(bigX))) + copy(r2.y[:], fromBig(maybeReduceModP(bigY))) + copy(r2.z[:], one) + p256MulAsm(r2.x[:], r2.x[:], rr[:]) + p256MulAsm(r2.y[:], r2.y[:], rr[:]) + + scalarReduced = p256GetMultiplier(scalar) + r2IsInfinity := scalarIsZero(scalarReduced) + r2.p256ScalarMult(scalarReduced) + + var sum, double p256Point + pointsEqual := p256PointAddAsm(&sum, &r1, &r2) + p256PointDoubleAsm(&double, &r1) + p256MovCond(&sum, &double, &sum, pointsEqual) + p256MovCond(&sum, &r1, &sum, r2IsInfinity) + p256MovCond(&sum, &r2, &sum, r1IsInfinity) + return sum.p256PointToAffine() +} + +func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) { + var r p256Point + reducedScalar := p256GetMultiplier(scalar) + r.p256BaseMult(reducedScalar) + return r.p256PointToAffine() +} + +func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) { + scalarReduced := p256GetMultiplier(scalar) + var r p256Point + copy(r.x[:], fromBig(maybeReduceModP(bigX))) + copy(r.y[:], fromBig(maybeReduceModP(bigY))) + copy(r.z[:], one) + p256MulAsm(r.x[:], r.x[:], rr[:]) + p256MulAsm(r.y[:], r.y[:], rr[:]) + r.p256ScalarMult(scalarReduced) + return r.p256PointToAffine() +} + +func scalarIsZero(scalar []byte) int { + // If any byte is not zero, return 0. + // Check for -0.... since that appears to compare to 0. + b := byte(0) + for _, s := range scalar { + b |= s + } + return subtle.ConstantTimeByteEq(b, 0) +} + +func (p *p256Point) p256PointToAffine() (x, y *big.Int) { + zInv := make([]byte, 32) + zInvSq := make([]byte, 32) + + p256Inverse(zInv, p.z[:]) + p256Sqr(zInvSq, zInv) + p256MulAsm(zInv, zInv, zInvSq) + + p256MulAsm(zInvSq, p.x[:], zInvSq) + p256MulAsm(zInv, p.y[:], zInv) + + p256FromMont(zInvSq, zInvSq) + p256FromMont(zInv, zInv) + + // SetBytes expects a slice in big endian order, + // since ppc64le is little endian, reverse the bytes. + // TODO: For big endian, bytes don't need to be reversed. + p256ReverseBytes(zInvSq, zInvSq) + p256ReverseBytes(zInv, zInv) + rx := new(big.Int).SetBytes(zInvSq) + ry := new(big.Int).SetBytes(zInv) + return rx, ry +} + +// p256Inverse sets out to in^-1 mod p. 
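+// It computes in^(p-2) mod p (Fermat's little theorem) with a fixed
+// sequence of squarings and multiplications, so the time taken does
+// not depend on the value of in. Input and output are in the
+// Montgomery domain, like all operands of p256MulAsm and p256Sqr.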
+func p256Inverse(out, in []byte) { + var stack [6 * 32]byte + p2 := stack[32*0 : 32*0+32] + p4 := stack[32*1 : 32*1+32] + p8 := stack[32*2 : 32*2+32] + p16 := stack[32*3 : 32*3+32] + p32 := stack[32*4 : 32*4+32] + + p256Sqr(out, in) + p256MulAsm(p2, out, in) // 3*p + + p256Sqr(out, p2) + p256Sqr(out, out) + p256MulAsm(p4, out, p2) // f*p + + p256Sqr(out, p4) + p256Sqr(out, out) + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(p8, out, p4) // ff*p + + p256Sqr(out, p8) + + for i := 0; i < 7; i++ { + p256Sqr(out, out) + } + p256MulAsm(p16, out, p8) // ffff*p + + p256Sqr(out, p16) + for i := 0; i < 15; i++ { + p256Sqr(out, out) + } + p256MulAsm(p32, out, p16) // ffffffff*p + + p256Sqr(out, p32) + + for i := 0; i < 31; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, in) + + for i := 0; i < 32*4; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p32) + + for i := 0; i < 32; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p32) + + for i := 0; i < 16; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p16) + + for i := 0; i < 8; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p8) + + p256Sqr(out, out) + p256Sqr(out, out) + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(out, out, p4) + + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(out, out, p2) + + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(out, out, in) +} + +func boothW5(in uint) (int, int) { + var s uint = ^((in >> 5) - 1) + var d uint = (1 << 6) - in - 1 + d = (d & s) | (in & (^s)) + d = (d >> 1) + (d & 1) + return int(d), int(s & 1) +} + +func boothW6(in uint) (int, int) { + var s uint = ^((in >> 6) - 1) + var d uint = (1 << 7) - in - 1 + d = (d & s) | (in & (^s)) + d = (d >> 1) + (d & 1) + return int(d), int(s & 1) +} + +func boothW7(in uint) (int, int) { + var s uint = ^((in >> 7) - 1) + var d uint = (1 << 8) - in - 1 + d = (d & s) | (in & (^s)) + d = (d >> 1) + (d & 1) + return int(d), int(s & 1) +} + +func initTable() { + + p256PreFast = new([37][64]p256Point) + + // TODO: For big endian, these slices should be in reverse byte order, + // as found in the s390x implementation. + basePoint := p256Point{ + x: [32]byte{0x3c, 0x14, 0xa9, 0x18, 0xd4, 0x30, 0xe7, 0x79, 0x01, 0xb6, 0xed, 0x5f, 0xfc, 0x95, 0xba, 0x75, + 0x10, 0x25, 0x62, 0x77, 0x2b, 0x73, 0xfb, 0x79, 0xc6, 0x55, 0x37, 0xa5, 0x76, 0x5f, 0x90, 0x18}, //(p256.x*2^256)%p + y: [32]byte{0x0a, 0x56, 0x95, 0xce, 0x57, 0x53, 0xf2, 0xdd, 0x5c, 0xe4, 0x19, 0xba, 0xe4, 0xb8, 0x4a, 0x8b, + 0x25, 0xf3, 0x21, 0xdd, 0x88, 0x86, 0xe8, 0xd2, 0x85, 0x5d, 0x88, 0x25, 0x18, 0xff, 0x71, 0x85}, //(p256.y*2^256)%p + z: [32]byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, //(p256.z*2^256)%p + + } + + t1 := new(p256Point) + t2 := new(p256Point) + *t2 = basePoint + + zInv := make([]byte, 32) + zInvSq := make([]byte, 32) + for j := 0; j < 64; j++ { + *t1 = *t2 + for i := 0; i < 37; i++ { + // The window size is 7 so we need to double 7 times. + if i != 0 { + for k := 0; k < 7; k++ { + p256PointDoubleAsm(t1, t1) + } + } + // Convert the point to affine form. (Its values are + // still in Montgomery form however.) 
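+			// In Jacobian coordinates that means
+			// x_affine = x/z^2 and y_affine = y/z^3.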
+ p256Inverse(zInv, t1.z[:]) + p256Sqr(zInvSq, zInv) + p256MulAsm(zInv, zInv, zInvSq) + + p256MulAsm(t1.x[:], t1.x[:], zInvSq) + p256MulAsm(t1.y[:], t1.y[:], zInv) + + copy(t1.z[:], basePoint.z[:]) + // Update the table entry + copy(p256PreFast[i][j].x[:], t1.x[:]) + copy(p256PreFast[i][j].y[:], t1.y[:]) + } + if j == 0 { + p256PointDoubleAsm(t2, &basePoint) + } else { + p256PointAddAsm(t2, t2, &basePoint) + } + } +} + +func (p *p256Point) p256BaseMult(scalar []byte) { + // TODO: For big endian, the index should be 31 not 0. + wvalue := (uint(scalar[0]) << 1) & 0xff + sel, sign := boothW7(uint(wvalue)) + p256SelectBase(p, p256PreFast[0][:], sel) + p256NegCond(p, sign) + + copy(p.z[:], one[:]) + var t0 p256Point + + copy(t0.z[:], one[:]) + + index := uint(6) + zero := sel + for i := 1; i < 37; i++ { + // TODO: For big endian, use the same index values as found + // in the s390x implementation. + if index < 247 { + wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0xff + } else { + wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0xff + } + index += 7 + sel, sign = boothW7(uint(wvalue)) + p256SelectBase(&t0, p256PreFast[i][:], sel) + p256PointAddAffineAsm(p, p, &t0, sign, sel, zero) + zero |= sel + } +} + +func (p *p256Point) p256ScalarMult(scalar []byte) { + // precomp is a table of precomputed points that stores powers of p + // from p^1 to p^16. + var precomp [16]p256Point + var t0, t1, t2, t3 p256Point + + *&precomp[0] = *p + p256PointDoubleAsm(&t0, p) + p256PointDoubleAsm(&t1, &t0) + p256PointDoubleAsm(&t2, &t1) + p256PointDoubleAsm(&t3, &t2) + *&precomp[1] = t0 + *&precomp[3] = t1 + *&precomp[7] = t2 + *&precomp[15] = t3 + + p256PointAddAsm(&t0, &t0, p) + p256PointAddAsm(&t1, &t1, p) + p256PointAddAsm(&t2, &t2, p) + + *&precomp[2] = t0 + *&precomp[4] = t1 + *&precomp[8] = t2 + + p256PointDoubleAsm(&t0, &t0) + p256PointDoubleAsm(&t1, &t1) + *&precomp[5] = t0 + *&precomp[9] = t1 + + p256PointAddAsm(&t2, &t0, p) + p256PointAddAsm(&t1, &t1, p) + *&precomp[6] = t2 + *&precomp[10] = t1 + + p256PointDoubleAsm(&t0, &t0) + p256PointDoubleAsm(&t2, &t2) + *&precomp[11] = t0 + *&precomp[13] = t2 + + p256PointAddAsm(&t0, &t0, p) + p256PointAddAsm(&t2, &t2, p) + *&precomp[12] = t0 + *&precomp[14] = t2 + + // Start scanning the window from top bit + index := uint(254) + var sel, sign int + + // TODO: For big endian, use index found in s390x implementation. + wvalue := (uint(scalar[index/8]) >> (index % 8)) & 0x3f + sel, _ = boothW5(uint(wvalue)) + p256Select(p, precomp[:], sel) + zero := sel + + for index > 4 { + index -= 5 + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + + // TODO: For big endian, use index values as found in s390x implementation. + if index < 247 { + wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0x3f + } else { + wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0x3f + } + + sel, sign = boothW5(uint(wvalue)) + + p256Select(&t0, precomp[:], sel) + p256NegCond(&t0, sign) + p256PointAddAsm(&t1, p, &t0) + p256MovCond(&t1, &t1, p, sel) + p256MovCond(p, &t1, &t0, zero) + zero |= sel + } + + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + + // TODO: Use index for big endian as found in s390x implementation. 
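+	// The lowest window is special: Booth recoding also looks at the
+	// bit below the window, which does not exist for bit 0, so a zero
+	// bit is shifted in.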
+ wvalue = (uint(scalar[0]) << 1) & 0x3f + sel, sign = boothW5(uint(wvalue)) + + p256Select(&t0, precomp[:], sel) + p256NegCond(&t0, sign) + p256PointAddAsm(&t1, p, &t0) + p256MovCond(&t1, &t1, p, sel) + p256MovCond(p, &t1, &t0, zero) +}