From 56a7c5b95c01ccf23914ddb70193c3943ae5aaa0 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Tue, 21 Apr 2015 10:33:30 -0700 Subject: [PATCH] math/big: add partial arm64 assembly support benchmark old ns/op new ns/op delta BenchmarkAddVV_1 18.7 14.8 -20.86% BenchmarkAddVV_2 21.8 16.6 -23.85% BenchmarkAddVV_3 26.1 17.1 -34.48% BenchmarkAddVV_4 30.4 21.9 -27.96% BenchmarkAddVV_5 35.5 19.8 -44.23% BenchmarkAddVV_1e1 63.0 28.3 -55.08% BenchmarkAddVV_1e2 593 178 -69.98% BenchmarkAddVV_1e3 5691 1490 -73.82% BenchmarkAddVV_1e4 56868 20761 -63.49% BenchmarkAddVV_1e5 569062 207679 -63.51% BenchmarkAddVW_1 15.8 12.6 -20.25% BenchmarkAddVW_2 17.8 13.1 -26.40% BenchmarkAddVW_3 21.2 13.9 -34.43% BenchmarkAddVW_4 23.6 14.7 -37.71% BenchmarkAddVW_5 26.0 15.8 -39.23% BenchmarkAddVW_1e1 41.3 21.6 -47.70% BenchmarkAddVW_1e2 383 145 -62.14% BenchmarkAddVW_1e3 3703 1264 -65.87% BenchmarkAddVW_1e4 36920 14359 -61.11% BenchmarkAddVW_1e5 370345 143046 -61.37% BenchmarkAddMulVVW_1 33.2 32.5 -2.11% BenchmarkAddMulVVW_2 58.0 57.2 -1.38% BenchmarkAddMulVVW_3 95.2 93.9 -1.37% BenchmarkAddMulVVW_4 108 106 -1.85% BenchmarkAddMulVVW_5 159 156 -1.89% BenchmarkAddMulVVW_1e1 344 340 -1.16% BenchmarkAddMulVVW_1e2 3644 3624 -0.55% BenchmarkAddMulVVW_1e3 37344 37208 -0.36% BenchmarkAddMulVVW_1e4 373295 372170 -0.30% BenchmarkAddMulVVW_1e5 3438116 3425606 -0.36% BenchmarkBitLen0 7.21 4.32 -40.08% BenchmarkBitLen1 6.49 4.32 -33.44% BenchmarkBitLen2 7.23 4.32 -40.25% BenchmarkBitLen3 6.49 4.32 -33.44% BenchmarkBitLen4 7.22 4.32 -40.17% BenchmarkBitLen5 6.52 4.33 -33.59% BenchmarkBitLen8 7.22 4.32 -40.17% BenchmarkBitLen9 6.49 4.32 -33.44% BenchmarkBitLen16 8.66 4.32 -50.12% BenchmarkBitLen17 7.95 4.32 -45.66% BenchmarkBitLen31 8.69 4.32 -50.29% BenchmarkGCD10x10 5021 5033 +0.24% BenchmarkGCD10x100 5571 5572 +0.02% BenchmarkGCD10x1000 6707 6729 +0.33% BenchmarkGCD10x10000 13526 13419 -0.79% BenchmarkGCD10x100000 85668 83242 -2.83% BenchmarkGCD100x100 24196 23936 -1.07% BenchmarkGCD100x1000 28802 27309 -5.18% BenchmarkGCD100x10000 64111 51704 -19.35% BenchmarkGCD100x100000 385840 274385 -28.89% BenchmarkGCD1000x1000 262892 236269 -10.13% BenchmarkGCD1000x10000 371393 277883 -25.18% BenchmarkGCD1000x100000 1311795 589055 -55.10% BenchmarkGCD10000x10000 9596740 6123930 -36.19% BenchmarkGCD10000x100000 16404000 7269610 -55.68% BenchmarkGCD100000x100000 776660000 419270000 -46.02% BenchmarkHilbert 13478980 13402270 -0.57% BenchmarkBinomial 9802 9440 -3.69% BenchmarkBitset 142 142 +0.00% BenchmarkBitsetNeg 328 279 -14.94% BenchmarkBitsetOrig 853 861 +0.94% BenchmarkBitsetNegOrig 1489 1444 -3.02% BenchmarkMul 420949000 410481000 -2.49% BenchmarkExp3Power0x10 1148 1229 +7.06% BenchmarkExp3Power0x40 1322 1376 +4.08% BenchmarkExp3Power0x100 2437 2486 +2.01% BenchmarkExp3Power0x400 9456 9346 -1.16% BenchmarkExp3Power0x1000 113623 108701 -4.33% BenchmarkExp3Power0x4000 1134933 1101481 -2.95% BenchmarkExp3Power0x10000 10773570 10396160 -3.50% BenchmarkExp3Power0x40000 101362100 97788300 -3.53% BenchmarkExp3Power0x100000 921114000 885249000 -3.89% BenchmarkExp3Power0x400000 8323094000 7969020000 -4.25% BenchmarkFibo 322021600 92554450 -71.26% BenchmarkScanPi 1264583 321065 -74.61% BenchmarkStringPiParallel 1644661 554216 -66.30% BenchmarkScan10Base2 1111 1080 -2.79% BenchmarkScan100Base2 6645 6345 -4.51% BenchmarkScan1000Base2 84084 62405 -25.78% BenchmarkScan10000Base2 3105998 932551 -69.98% BenchmarkScan100000Base2 257234800 40113333 -84.41% BenchmarkScan10Base8 571 573 +0.35% BenchmarkScan100Base8 2810 2543 -9.50% BenchmarkScan1000Base8 47383 25834 -45.48% BenchmarkScan10000Base8 2739518 567203 -79.30% BenchmarkScan100000Base8 253952400 36495680 -85.63% BenchmarkScan10Base10 553 556 +0.54% BenchmarkScan100Base10 2640 2385 -9.66% BenchmarkScan1000Base10 50865 24049 -52.72% BenchmarkScan10000Base10 3279916 549313 -83.25% BenchmarkScan100000Base10 309121000 36213140 -88.29% BenchmarkScan10Base16 478 483 +1.05% BenchmarkScan100Base16 2353 2144 -8.88% BenchmarkScan1000Base16 48091 24246 -49.58% BenchmarkScan10000Base16 2858886 586475 -79.49% BenchmarkScan100000Base16 266320000 38190500 -85.66% BenchmarkString10Base2 736 730 -0.82% BenchmarkString100Base2 2695 2707 +0.45% BenchmarkString1000Base2 20549 20388 -0.78% BenchmarkString10000Base2 212638 210782 -0.87% BenchmarkString100000Base2 1944963 1938033 -0.36% BenchmarkString10Base8 524 517 -1.34% BenchmarkString100Base8 1326 1320 -0.45% BenchmarkString1000Base8 8213 8249 +0.44% BenchmarkString10000Base8 72204 72092 -0.16% BenchmarkString100000Base8 769068 765993 -0.40% BenchmarkString10Base10 1018 982 -3.54% BenchmarkString100Base10 3485 3206 -8.01% BenchmarkString1000Base10 37102 18935 -48.97% BenchmarkString10000Base10 188633 88637 -53.01% BenchmarkString100000Base10 124490300 19700940 -84.17% BenchmarkString10Base16 509 502 -1.38% BenchmarkString100Base16 1084 1098 +1.29% BenchmarkString1000Base16 5641 5650 +0.16% BenchmarkString10000Base16 46900 46745 -0.33% BenchmarkString100000Base16 508957 505840 -0.61% BenchmarkLeafSize0 8934320 8149465 -8.78% BenchmarkLeafSize1 237666 118381 -50.19% BenchmarkLeafSize2 237807 117854 -50.44% BenchmarkLeafSize3 1688640 353494 -79.07% BenchmarkLeafSize4 235676 116196 -50.70% BenchmarkLeafSize5 2121896 430325 -79.72% BenchmarkLeafSize6 1682306 351775 -79.09% BenchmarkLeafSize7 1051847 251436 -76.10% BenchmarkLeafSize8 232697 115674 -50.29% BenchmarkLeafSize9 2403616 488443 -79.68% BenchmarkLeafSize10 2120975 429545 -79.75% BenchmarkLeafSize11 2023789 426525 -78.92% BenchmarkLeafSize12 1684830 351985 -79.11% BenchmarkLeafSize13 1465529 337906 -76.94% BenchmarkLeafSize14 1050498 253872 -75.83% BenchmarkLeafSize15 683228 197384 -71.11% BenchmarkLeafSize16 232496 116026 -50.10% BenchmarkLeafSize32 245841 126671 -48.47% BenchmarkLeafSize64 301728 190285 -36.93% Change-Id: I63e63297896d96b89c9a275b893c2b405a7e105d Reviewed-on: https://go-review.googlesource.com/9260 Reviewed-by: David Crawshaw --- src/math/big/arith_arm64.s | 149 ++++++++++++++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 9 deletions(-) diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index 6e10e47be3..24a717cbb0 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -9,38 +9,169 @@ // This file provides fast assembly versions for the elementary // arithmetic operations on vectors implemented in arith.go. +// TODO: Consider re-implementing using Advanced SIMD +// once the assembler supports those instructions. + +// func mulWW(x, y Word) (z1, z0 Word) TEXT ·mulWW(SB),NOSPLIT,$0 - B ·mulWW_g(SB) + MOVD x+0(FP), R0 + MOVD y+8(FP), R1 + MUL R0, R1, R2 + UMULH R0, R1, R3 + MOVD R3, z1+16(FP) + MOVD R2, z0+24(FP) + RET + +// func divWW(x1, x0, y Word) (q, r Word) TEXT ·divWW(SB),NOSPLIT,$0 - B ·divWW_g(SB) + B ·divWW_g(SB) // ARM64 has no multiword division + +// func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),NOSPLIT,$0 - B ·addVV_g(SB) + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + ADDS $0, R0 // clear carry flag +loop: + CBZ R0, done // careful not to touch the carry flag + MOVD.P 8(R1), R4 + MOVD.P 8(R2), R5 + ADCS R4, R5 + MOVD.P R5, 8(R3) + SUB $1, R0 + B loop +done: + CSET HS, R0 // extract carry flag + MOVD R0, c+72(FP) + RET + +// func subVV(z, x, y []Word) (c Word) TEXT ·subVV(SB),NOSPLIT,$0 - B ·subVV_g(SB) + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CMP R0, R0 // set carry flag +loop: + CBZ R0, done // careful not to touch the carry flag + MOVD.P 8(R1), R4 + MOVD.P 8(R2), R5 + SBCS R5, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 + B loop +done: + CSET LO, R0 // extract carry flag + MOVD R0, c+72(FP) + RET + +// func addVW(z, x []Word, y Word) (c Word) TEXT ·addVW(SB),NOSPLIT,$0 - B ·addVW_g(SB) + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CBZ R0, return_y + MOVD.P 8(R1), R4 + ADDS R2, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 +loop: + CBZ R0, done // careful not to touch the carry flag + MOVD.P 8(R1), R4 + ADCS $0, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 + B loop +done: + CSET HS, R0 // extract carry flag + MOVD R0, c+56(FP) + RET +return_y: // z is empty; copy y to c + MOVD R2, c+56(FP) + RET + +// func subVW(z, x []Word, y Word) (c Word) TEXT ·subVW(SB),NOSPLIT,$0 - B ·subVW_g(SB) + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CBZ R0, rety + MOVD.P 8(R1), R4 + SUBS R2, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 +loop: + CBZ R0, done // careful not to touch the carry flag + MOVD.P 8(R1), R4 + SBCS $0, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 + B loop +done: + CSET LO, R0 // extract carry flag + MOVD R0, c+56(FP) + RET +rety: // z is empty; copy y to c + MOVD R2, c+56(FP) + RET + +// func shlVU(z, x []Word, s uint) (c Word) TEXT ·shlVU(SB),NOSPLIT,$0 B ·shlVU_g(SB) + +// func shrVU(z, x []Word, s uint) (c Word) TEXT ·shrVU(SB),NOSPLIT,$0 B ·shrVU_g(SB) -TEXT ·mulAddVWW(SB),NOSPLIT,$0 - B ·mulAddVWW_g(SB) +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVD z+0(FP), R1 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R2 + MOVD y+48(FP), R3 + MOVD r+56(FP), R4 +loop: + CBZ R0, done + MOVD.P 8(R2), R5 + UMULH R5, R3, R7 + MUL R5, R3, R6 + ADDS R4, R6 + ADC $0, R7 + MOVD.P R6, 8(R1) + MOVD R7, R4 + SUB $1, R0 + B loop +done: + MOVD R4, c+64(FP) + RET + + +// func addMulVVW(z, x []Word, y Word) (c Word) TEXT ·addMulVVW(SB),NOSPLIT,$0 B ·addMulVVW_g(SB) + +// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) TEXT ·divWVW(SB),NOSPLIT,$0 B ·divWVW_g(SB) + +// func bitLen(x Word) (n int) TEXT ·bitLen(SB),NOSPLIT,$0 - B ·bitLen_g(SB) + MOVD x+0(FP), R0 + CLZ R0, R0 + MOVD $64, R1 + SUB R0, R1, R0 + MOVD R0, n+8(FP) + RET