mirror of
https://github.com/golang/go
synced 2024-11-22 03:24:41 -07:00
runtime: optimize the function memmove on loong64
benchmarck on 3A6000: goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3A6000 @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ Memmove/0 0.6003n ± 0% 0.6003n ± 0% ~ (p=0.487 n=20) Memmove/1 4.402n ± 0% 2.815n ± 0% -36.05% (p=0.000 n=20) Memmove/2 5.202n ± 0% 3.202n ± 0% -38.45% (p=0.000 n=20) Memmove/3 6.003n ± 0% 2.820n ± 0% -53.02% (p=0.000 n=20) Memmove/4 6.803n ± 0% 3.202n ± 0% -52.93% (p=0.000 n=20) Memmove/5 7.604n ± 0% 3.202n ± 0% -57.89% (p=0.000 n=20) Memmove/6 8.404n ± 0% 3.202n ± 0% -61.90% (p=0.000 n=20) Memmove/7 9.204n ± 0% 3.202n ± 0% -65.21% (p=0.000 n=20) Memmove/8 4.802n ± 0% 3.602n ± 0% -24.99% (p=0.000 n=20) Memmove/9 6.003n ± 0% 3.202n ± 0% -46.66% (p=0.000 n=20) Memmove/10 6.803n ± 0% 3.202n ± 0% -52.93% (p=0.000 n=20) Memmove/11 7.604n ± 0% 3.202n ± 0% -57.89% (p=0.000 n=20) Memmove/12 8.404n ± 0% 3.202n ± 0% -61.90% (p=0.000 n=20) Memmove/13 9.204n ± 0% 3.202n ± 0% -65.21% (p=0.000 n=20) Memmove/14 10.000n ± 0% 3.202n ± 0% -67.98% (p=0.000 n=20) Memmove/15 10.810n ± 0% 3.202n ± 0% -70.38% (p=0.000 n=20) Memmove/16 6.003n ± 0% 3.202n ± 0% -46.66% (p=0.000 n=20) Memmove/32 7.604n ± 0% 3.602n ± 0% -52.63% (p=0.000 n=20) Memmove/64 10.810n ± 0% 4.402n ± 0% -59.28% (p=0.000 n=20) Memmove/128 17.210n ± 0% 8.004n ± 0% -53.49% (p=0.000 n=20) Memmove/256 30.41n ± 0% 10.81n ± 0% -64.45% (p=0.000 n=20) Memmove/512 56.03n ± 0% 17.81n ± 0% -68.21% (p=0.000 n=20) Memmove/1024 107.30n ± 0% 30.62n ± 0% -71.46% (p=0.000 n=20) Memmove/2048 209.70n ± 0% 56.23n ± 0% -73.19% (p=0.000 n=20) Memmove/4096 414.6n ± 0% 107.5n ± 0% -74.07% (p=0.000 n=20) MemmoveOverlap/32 8.404n ± 0% 4.402n ± 0% -47.62% (p=0.000 n=20) MemmoveOverlap/64 11.610n ± 0% 5.003n ± 0% -56.91% (p=0.000 n=20) MemmoveOverlap/128 18.010n ± 0% 9.005n ± 0% -50.00% (p=0.000 n=20) MemmoveOverlap/256 31.22n ± 0% 12.41n ± 0% -60.25% (p=0.000 n=20) MemmoveOverlap/512 56.83n ± 0% 19.08n ± 0% -66.43% (p=0.000 n=20) MemmoveOverlap/1024 108.10n ± 0% 32.00n ± 0% -70.40% (p=0.000 n=20) MemmoveOverlap/2048 210.50n ± 0% 57.94n ± 0% -72.48% (p=0.000 n=20) MemmoveOverlap/4096 415.4n ± 0% 108.9n ± 0% -73.78% (p=0.000 n=20) MemmoveUnalignedDst/0 2.448n ± 0% 2.942n ± 0% +20.16% (p=0.000 n=20) MemmoveUnalignedDst/1 4.802n ± 0% 3.202n ± 0% -33.32% (p=0.000 n=20) MemmoveUnalignedDst/2 5.603n ± 0% 3.602n ± 0% -35.71% (p=0.000 n=20) MemmoveUnalignedDst/3 6.403n ± 0% 3.202n ± 0% -49.99% (p=0.000 n=20) MemmoveUnalignedDst/4 7.203n ± 0% 3.602n ± 0% -49.99% (p=0.000 n=20) MemmoveUnalignedDst/5 8.004n ± 0% 3.602n ± 0% -55.00% (p=0.000 n=20) MemmoveUnalignedDst/6 8.804n ± 0% 3.602n ± 0% -59.09% (p=0.000 n=20) MemmoveUnalignedDst/7 9.605n ± 0% 3.602n ± 0% -62.50% (p=0.000 n=20) MemmoveUnalignedDst/8 10.400n ± 0% 4.002n ± 0% -61.52% (p=0.000 n=20) MemmoveUnalignedDst/9 11.210n ± 0% 3.802n ± 0% -66.08% (p=0.000 n=20) MemmoveUnalignedDst/10 12.010n ± 0% 3.802n ± 0% -68.34% (p=0.000 n=20) MemmoveUnalignedDst/11 12.810n ± 0% 3.802n ± 0% -70.32% (p=0.000 n=20) MemmoveUnalignedDst/12 13.610n ± 0% 3.802n ± 0% -72.06% (p=0.000 n=20) MemmoveUnalignedDst/13 14.410n ± 0% 3.802n ± 0% -73.62% (p=0.000 n=20) MemmoveUnalignedDst/14 15.210n ± 0% 3.802n ± 0% -75.00% (p=0.000 n=20) MemmoveUnalignedDst/15 16.010n ± 0% 3.802n ± 0% -76.25% (p=0.000 n=20) MemmoveUnalignedDst/16 17.210n ± 0% 3.802n ± 0% -77.91% (p=0.000 n=20) MemmoveUnalignedDst/32 30.020n ± 0% 4.202n ± 0% -86.00% (p=0.000 n=20) MemmoveUnalignedDst/64 56.030n ± 0% 6.804n ± 0% -87.86% (p=0.000 n=20) MemmoveUnalignedDst/128 106.90n ± 0% 13.61n ± 0% -87.27% (p=0.000 n=20) MemmoveUnalignedDst/256 209.50n ± 0% 17.07n ± 1% -91.85% (p=0.000 n=20) MemmoveUnalignedDst/512 414.60n ± 0% 24.95n ± 0% -93.98% (p=0.000 n=20) MemmoveUnalignedDst/1024 828.40n ± 0% 42.82n ± 0% -94.83% (p=0.000 n=20) MemmoveUnalignedDst/2048 1648.00n ± 0% 78.04n ± 0% -95.26% (p=0.000 n=20) MemmoveUnalignedDst/4096 3287.0n ± 0% 148.4n ± 0% -95.49% (p=0.000 n=20) MemmoveUnalignedDstOverlap/32 30.810n ± 0% 5.603n ± 0% -81.81% (p=0.000 n=20) MemmoveUnalignedDstOverlap/64 56.430n ± 0% 7.604n ± 0% -86.52% (p=0.000 n=20) MemmoveUnalignedDstOverlap/128 107.700n ± 0% 9.812n ± 0% -90.89% (p=0.000 n=20) MemmoveUnalignedDstOverlap/256 210.10n ± 0% 13.50n ± 0% -93.57% (p=0.000 n=20) MemmoveUnalignedDstOverlap/512 415.00n ± 0% 21.21n ± 0% -94.89% (p=0.000 n=20) MemmoveUnalignedDstOverlap/1024 828.80n ± 0% 41.02n ± 0% -95.05% (p=0.000 n=20) MemmoveUnalignedDstOverlap/2048 1648.00n ± 0% 80.23n ± 0% -95.13% (p=0.000 n=20) MemmoveUnalignedDstOverlap/4096 3288.0n ± 0% 162.4n ± 0% -95.06% (p=0.000 n=20) MemmoveUnalignedSrc/0 2.468n ± 1% 2.913n ± 0% +18.01% (p=0.000 n=20) MemmoveUnalignedSrc/1 4.802n ± 0% 3.202n ± 0% -33.32% (p=0.000 n=20) MemmoveUnalignedSrc/2 5.603n ± 0% 3.603n ± 0% -35.70% (p=0.000 n=20) MemmoveUnalignedSrc/3 6.403n ± 0% 3.207n ± 0% -49.91% (p=0.000 n=20) MemmoveUnalignedSrc/4 7.203n ± 0% 3.603n ± 0% -49.98% (p=0.000 n=20) MemmoveUnalignedSrc/5 8.004n ± 0% 3.602n ± 0% -55.00% (p=0.000 n=20) MemmoveUnalignedSrc/6 8.804n ± 0% 3.602n ± 0% -59.09% (p=0.000 n=20) MemmoveUnalignedSrc/7 9.605n ± 0% 3.602n ± 0% -62.50% (p=0.000 n=20) MemmoveUnalignedSrc/8 10.410n ± 0% 4.002n ± 0% -61.56% (p=0.000 n=20) MemmoveUnalignedSrc/9 11.210n ± 0% 3.802n ± 0% -66.08% (p=0.000 n=20) MemmoveUnalignedSrc/10 12.010n ± 0% 3.802n ± 0% -68.34% (p=0.000 n=20) MemmoveUnalignedSrc/11 12.810n ± 0% 3.802n ± 0% -70.32% (p=0.000 n=20) MemmoveUnalignedSrc/12 13.610n ± 0% 3.802n ± 0% -72.06% (p=0.000 n=20) MemmoveUnalignedSrc/13 14.410n ± 0% 3.802n ± 0% -73.62% (p=0.000 n=20) MemmoveUnalignedSrc/14 15.210n ± 0% 3.802n ± 0% -75.00% (p=0.000 n=20) MemmoveUnalignedSrc/15 16.010n ± 0% 3.802n ± 0% -76.25% (p=0.000 n=20) MemmoveUnalignedSrc/16 16.810n ± 0% 3.802n ± 0% -77.38% (p=0.000 n=20) MemmoveUnalignedSrc/32 30.410n ± 0% 4.301n ± 0% -85.86% (p=0.000 n=20) MemmoveUnalignedSrc/64 55.630n ± 0% 5.203n ± 0% -90.65% (p=0.000 n=20) MemmoveUnalignedSrc/128 107.300n ± 0% 8.805n ± 0% -91.79% (p=0.000 n=20) MemmoveUnalignedSrc/256 209.50n ± 0% 12.41n ± 6% -94.08% (p=0.000 n=20) MemmoveUnalignedSrc/512 414.20n ± 0% 20.41n ± 0% -95.07% (p=0.000 n=20) MemmoveUnalignedSrc/1024 828.00n ± 0% 36.92n ± 0% -95.54% (p=0.000 n=20) MemmoveUnalignedSrc/2048 1648.00n ± 0% 71.41n ± 0% -95.67% (p=0.000 n=20) MemmoveUnalignedSrc/4096 3287.0n ± 0% 132.2n ± 0% -95.98% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_16_0 7.203n ± 0% 5.002n ± 0% -30.56% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_16_0 7.604n ± 0% 5.002n ± 0% -34.22% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_16_1 13.210n ± 0% 5.002n ± 0% -62.13% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_16_1 13.210n ± 0% 5.002n ± 0% -62.13% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_16_4 13.210n ± 0% 5.002n ± 0% -62.13% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_16_4 13.610n ± 0% 5.002n ± 0% -63.24% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_16_7 12.810n ± 0% 5.002n ± 0% -60.95% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_16_7 13.610n ± 0% 5.002n ± 0% -63.24% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_64_0 12.010n ± 0% 7.191n ± 0% -40.12% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_0 12.410n ± 0% 7.194n ± 0% -42.03% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_64_1 18.410n ± 0% 7.604n ± 0% -58.70% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_1 18.410n ± 0% 7.604n ± 0% -58.70% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_64_4 18.410n ± 0% 7.604n ± 0% -58.70% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_4 18.810n ± 0% 7.604n ± 0% -59.57% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_64_7 18.010n ± 0% 7.604n ± 0% -57.78% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_64_7 18.810n ± 0% 7.604n ± 0% -59.57% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_256_0 31.62n ± 0% 14.19n ± 0% -55.12% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_256_0 32.02n ± 0% 13.61n ± 0% -57.50% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_256_1 38.02n ± 0% 18.20n ± 0% -52.13% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_256_1 38.02n ± 0% 18.41n ± 0% -51.58% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_256_4 38.02n ± 0% 17.21n ± 0% -54.73% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_256_4 38.42n ± 0% 16.81n ± 0% -56.25% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_256_7 37.62n ± 0% 15.61n ± 0% -58.51% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_256_7 38.42n ± 0% 15.01n ± 0% -60.93% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_4096_0 415.8n ± 0% 111.1n ± 0% -73.28% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_4096_0 416.2n ± 0% 110.5n ± 0% -73.45% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_4096_1 422.2n ± 0% 114.3n ± 0% -72.93% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_4096_1 422.2n ± 0% 114.7n ± 0% -72.83% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_4096_4 422.2n ± 0% 113.3n ± 0% -73.16% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_4096_4 422.6n ± 0% 113.1n ± 0% -73.24% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_4096_7 421.8n ± 0% 111.7n ± 0% -73.52% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_4096_7 422.6n ± 0% 111.7n ± 0% -73.57% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_65536_0 6.568µ ± 0% 4.869µ ± 0% -25.88% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_0 6.568µ ± 0% 5.009µ ± 0% -23.74% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_65536_1 6.574µ ± 0% 4.743µ ± 0% -27.85% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_1 6.574µ ± 0% 4.770µ ± 0% -27.44% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_65536_4 6.574µ ± 0% 4.758µ ± 0% -27.63% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_4 6.574µ ± 0% 4.768µ ± 0% -27.48% (p=0.000 n=20) MemmoveUnalignedSrcDst/f_65536_7 6.574µ ± 0% 4.757µ ± 0% -27.64% (p=0.000 n=20) MemmoveUnalignedSrcDst/b_65536_7 6.574µ ± 0% 4.583µ ± 0% -30.29% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/32 30.410n ± 0% 6.804n ± 0% -77.63% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/64 56.03n ± 0% 10.01n ± 0% -82.13% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/128 107.30n ± 0% 14.01n ± 0% -86.94% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/256 209.70n ± 0% 13.43n ± 1% -93.60% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/512 414.60n ± 0% 22.23n ± 0% -94.64% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/1024 828.40n ± 0% 37.62n ± 0% -95.46% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/2048 1648.00n ± 0% 68.04n ± 0% -95.87% (p=0.000 n=20) MemmoveUnalignedSrcOverlap/4096 3287.0n ± 0% 128.9n ± 0% -96.08% (p=0.000 n=20) geomean 48.94n 13.58n -72.26% The relevant performance improved by 72.26%. Change-Id: If2d3e09c3d687e733e6ff2c50feb8d6a8eb7e63b Reviewed-on: https://go-review.googlesource.com/c/go/+/589537 Reviewed-by: Tim King <taking@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Auto-Submit: Tim King <taking@google.com>
This commit is contained in:
parent
eb975601a0
commit
547baafa37
@ -6,99 +6,266 @@
|
||||
|
||||
// See memmove Go doc for important implementation constraints.
|
||||
|
||||
// Register map
|
||||
//
|
||||
// to R4
|
||||
// from R5
|
||||
// n(aka count) R6
|
||||
// to-end R7
|
||||
// from-end R8
|
||||
// data R11-R18
|
||||
// tmp R9
|
||||
|
||||
// Algorithm:
|
||||
//
|
||||
// Memory alignment check is only performed for copy size greater
|
||||
// than 64 bytes to minimize overhead.
|
||||
//
|
||||
// when copy size <= 64 bytes, jump to label tail, according to the
|
||||
// copy size to select the appropriate case and copy directly.
|
||||
// Based on the common memory access instructions of loong64, the
|
||||
// currently implemented cases are:
|
||||
// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
|
||||
// move_9through16, move_17through32, move_33through64
|
||||
//
|
||||
// when copy size > 64 bytes, use the destination-aligned copying,
|
||||
// adopt the following strategy to copy in 3 parts:
|
||||
// 1. Head: do the memory alignment
|
||||
// 2. Body: a 64-byte loop structure
|
||||
// 3. Tail: processing of the remaining part (<= 64 bytes)
|
||||
//
|
||||
// forward:
|
||||
//
|
||||
// Dst NewDst Dstend
|
||||
// | |<----count after correction---->|
|
||||
// |<-------------count before correction---------->|
|
||||
// |<--8-(Dst&7)-->| |<---64 bytes--->|
|
||||
// +------------------------------------------------+
|
||||
// | Head | Body | Tail |
|
||||
// +---------------+---------------+----------------+
|
||||
// NewDst = Dst - (Dst & 7) + 8
|
||||
// count = count - 8 + (Dst & 7)
|
||||
// Src = Src - (Dst & 7) + 8
|
||||
//
|
||||
// backward:
|
||||
//
|
||||
// Dst NewDstend Dstend
|
||||
// |<-----count after correction------>| |
|
||||
// |<------------count before correction--------------->|
|
||||
// |<---64 bytes--->| |<---Dstend&7--->|
|
||||
// +----------------------------------------------------+
|
||||
// | Tail | Body | Head |
|
||||
// +----------------+------------------+----------------+
|
||||
// NewDstend = Dstend - (Dstend & 7)
|
||||
// count = count - (Dstend & 7)
|
||||
// Srcend = Srcend - (Dstend & 7)
|
||||
|
||||
// func memmove(to, from unsafe.Pointer, n uintptr)
|
||||
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
|
||||
BNE R6, check
|
||||
RET
|
||||
BEQ R4, R5, move_0
|
||||
BEQ R6, move_0
|
||||
|
||||
check:
|
||||
SGTU R4, R5, R7
|
||||
BNE R7, backward
|
||||
ADDV R4, R6, R7 // to-end pointer
|
||||
ADDV R5, R6, R8 // from-end pointer
|
||||
|
||||
ADDV R4, R6, R9 // end pointer
|
||||
tail:
|
||||
//copy size <= 64 bytes, copy directly, not check aligned
|
||||
|
||||
// if the two pointers are not of same alignments, do byte copying
|
||||
SUBVU R5, R4, R7
|
||||
AND $7, R7
|
||||
BNE R7, out
|
||||
// < 2 bytes
|
||||
SGTU $2, R6, R9
|
||||
BNE R9, move_1
|
||||
|
||||
// if less than 8 bytes, do byte copying
|
||||
SGTU $8, R6, R7
|
||||
BNE R7, out
|
||||
// < 3 bytes
|
||||
SGTU $3, R6, R9
|
||||
BNE R9, move_2
|
||||
|
||||
// do one byte at a time until 8-aligned
|
||||
AND $7, R4, R8
|
||||
BEQ R8, words
|
||||
MOVB (R5), R7
|
||||
// < 4 bytes
|
||||
SGTU $4, R6, R9
|
||||
BNE R9, move_3
|
||||
|
||||
// < 5 bytes
|
||||
SGTU $5, R6, R9
|
||||
BNE R9, move_4
|
||||
|
||||
// >= 5 bytes and < 8 bytes
|
||||
SGTU $8, R6, R9
|
||||
BNE R9, move_5through7
|
||||
|
||||
// < 9 bytes
|
||||
SGTU $9, R6, R9
|
||||
BNE R9, move_8
|
||||
|
||||
// >= 9 bytes and < 17 bytes
|
||||
SGTU $17, R6, R9
|
||||
BNE R9, move_9through16
|
||||
|
||||
// >= 17 bytes and < 33 bytes
|
||||
SGTU $33, R6, R9
|
||||
BNE R9, move_17through32
|
||||
|
||||
// >= 33 bytes and < 65 bytes
|
||||
SGTU $65, R6, R9
|
||||
BNE R9, move_33through64
|
||||
|
||||
// if (dst > src) && (dst < src + count), regarded as memory
|
||||
// overlap, jump to backward
|
||||
// else, jump to forward
|
||||
BGEU R5, R4, forward
|
||||
ADDV R5, R6, R10
|
||||
BLTU R4, R10, backward
|
||||
|
||||
forward:
|
||||
AND $7, R4, R9 // dst & 7
|
||||
BEQ R9, body
|
||||
head:
|
||||
MOVV $8, R10
|
||||
SUBV R9, R10 // head = 8 - (dst & 7)
|
||||
MOVB (R5), R11
|
||||
SUBV $1, R10
|
||||
ADDV $1, R5
|
||||
MOVB R7, (R4)
|
||||
MOVB R11, (R4)
|
||||
ADDV $1, R4
|
||||
JMP -6(PC)
|
||||
BNE R10, -5(PC)
|
||||
ADDV R9, R6
|
||||
ADDV $-8, R6 // newcount = count + (dst & 7) - 8
|
||||
// if newcount < 65 bytes, use move_33through64 to copy is enough
|
||||
SGTU $65, R6, R9
|
||||
BNE R9, move_33through64
|
||||
|
||||
words:
|
||||
// do 8 bytes at a time if there is room
|
||||
ADDV $-7, R9, R6 // R6 is end pointer-7
|
||||
|
||||
PCALIGN $16
|
||||
SGTU R6, R4, R8
|
||||
BEQ R8, out
|
||||
MOVV (R5), R7
|
||||
ADDV $8, R5
|
||||
MOVV R7, (R4)
|
||||
ADDV $8, R4
|
||||
JMP -6(PC)
|
||||
|
||||
out:
|
||||
BEQ R4, R9, done
|
||||
MOVB (R5), R7
|
||||
ADDV $1, R5
|
||||
MOVB R7, (R4)
|
||||
ADDV $1, R4
|
||||
JMP -5(PC)
|
||||
done:
|
||||
RET
|
||||
body:
|
||||
MOVV (R5), R11
|
||||
MOVV 8(R5), R12
|
||||
MOVV 16(R5), R13
|
||||
MOVV 24(R5), R14
|
||||
MOVV 32(R5), R15
|
||||
MOVV 40(R5), R16
|
||||
MOVV 48(R5), R17
|
||||
MOVV 56(R5), R18
|
||||
MOVV R11, (R4)
|
||||
MOVV R12, 8(R4)
|
||||
MOVV R13, 16(R4)
|
||||
MOVV R14, 24(R4)
|
||||
MOVV R15, 32(R4)
|
||||
MOVV R16, 40(R4)
|
||||
MOVV R17, 48(R4)
|
||||
MOVV R18, 56(R4)
|
||||
ADDV $-64, R6
|
||||
ADDV $64, R4
|
||||
ADDV $64, R5
|
||||
SGTU $64, R6, R9
|
||||
// if the remaining part >= 64 bytes, jmp to body
|
||||
BEQ R9, body
|
||||
// if the remaining part == 0 bytes, use move_0 to return
|
||||
BEQ R6, move_0
|
||||
// if the remaining part in (0, 63] bytes, jmp to tail
|
||||
JMP tail
|
||||
|
||||
// The backward copy algorithm is the same as the forward copy,
|
||||
// except for the direction.
|
||||
backward:
|
||||
ADDV R6, R5 // from-end pointer
|
||||
ADDV R4, R6, R9 // to-end pointer
|
||||
AND $7, R7, R9 // dstend & 7
|
||||
BEQ R9, b_body
|
||||
b_head:
|
||||
MOVV -8(R8), R11
|
||||
SUBV R9, R6 // newcount = count - (dstend & 7)
|
||||
SUBV R9, R8 // newsrcend = srcend - (dstend & 7)
|
||||
MOVV -8(R8), R12
|
||||
MOVV R11, -8(R7)
|
||||
SUBV R9, R7 // newdstend = dstend - (dstend & 7)
|
||||
MOVV R12, -8(R7)
|
||||
SUBV $8, R6
|
||||
SUBV $8, R7
|
||||
SUBV $8, R8
|
||||
SGTU $65, R6, R9
|
||||
BNE R9, move_33through64
|
||||
|
||||
// if the two pointers are not of same alignments, do byte copying
|
||||
SUBVU R9, R5, R7
|
||||
AND $7, R7
|
||||
BNE R7, out1
|
||||
b_body:
|
||||
MOVV -8(R8), R11
|
||||
MOVV -16(R8), R12
|
||||
MOVV -24(R8), R13
|
||||
MOVV -32(R8), R14
|
||||
MOVV -40(R8), R15
|
||||
MOVV -48(R8), R16
|
||||
MOVV -56(R8), R17
|
||||
MOVV -64(R8), R18
|
||||
MOVV R11, -8(R7)
|
||||
MOVV R12, -16(R7)
|
||||
MOVV R13, -24(R7)
|
||||
MOVV R14, -32(R7)
|
||||
MOVV R15, -40(R7)
|
||||
MOVV R16, -48(R7)
|
||||
MOVV R17, -56(R7)
|
||||
MOVV R18, -64(R7)
|
||||
ADDV $-64, R6
|
||||
ADDV $-64, R7
|
||||
ADDV $-64, R8
|
||||
SGTU $64, R6, R9
|
||||
BEQ R9, b_body
|
||||
BEQ R6, move_0
|
||||
JMP tail
|
||||
|
||||
// if less than 8 bytes, do byte copying
|
||||
SGTU $8, R6, R7
|
||||
BNE R7, out1
|
||||
|
||||
// do one byte at a time until 8-aligned
|
||||
AND $7, R9, R8
|
||||
BEQ R8, words1
|
||||
ADDV $-1, R5
|
||||
MOVB (R5), R7
|
||||
ADDV $-1, R9
|
||||
MOVB R7, (R9)
|
||||
JMP -6(PC)
|
||||
|
||||
words1:
|
||||
// do 8 bytes at a time if there is room
|
||||
ADDV $7, R4, R6 // R6 is start pointer+7
|
||||
|
||||
PCALIGN $16
|
||||
SGTU R9, R6, R8
|
||||
BEQ R8, out1
|
||||
ADDV $-8, R5
|
||||
MOVV (R5), R7
|
||||
ADDV $-8, R9
|
||||
MOVV R7, (R9)
|
||||
JMP -6(PC)
|
||||
|
||||
out1:
|
||||
BEQ R4, R9, done1
|
||||
ADDV $-1, R5
|
||||
MOVB (R5), R7
|
||||
ADDV $-1, R9
|
||||
MOVB R7, (R9)
|
||||
JMP -5(PC)
|
||||
done1:
|
||||
move_0:
|
||||
RET
|
||||
|
||||
move_1:
|
||||
MOVB (R5), R11
|
||||
MOVB R11, (R4)
|
||||
RET
|
||||
move_2:
|
||||
MOVH (R5), R11
|
||||
MOVH R11, (R4)
|
||||
RET
|
||||
move_3:
|
||||
MOVH (R5), R11
|
||||
MOVB -1(R8), R12
|
||||
MOVH R11, (R4)
|
||||
MOVB R12, -1(R7)
|
||||
RET
|
||||
move_4:
|
||||
MOVW (R5), R11
|
||||
MOVW R11, (R4)
|
||||
RET
|
||||
move_5through7:
|
||||
MOVW (R5), R11
|
||||
MOVW -4(R8), R12
|
||||
MOVW R11, (R4)
|
||||
MOVW R12, -4(R7)
|
||||
RET
|
||||
move_8:
|
||||
MOVV (R5), R11
|
||||
MOVV R11, (R4)
|
||||
RET
|
||||
move_9through16:
|
||||
MOVV (R5), R11
|
||||
MOVV -8(R8), R12
|
||||
MOVV R11, (R4)
|
||||
MOVV R12, -8(R7)
|
||||
RET
|
||||
move_17through32:
|
||||
MOVV (R5), R11
|
||||
MOVV 8(R5), R12
|
||||
MOVV -16(R8), R13
|
||||
MOVV -8(R8), R14
|
||||
MOVV R11, (R4)
|
||||
MOVV R12, 8(R4)
|
||||
MOVV R13, -16(R7)
|
||||
MOVV R14, -8(R7)
|
||||
RET
|
||||
move_33through64:
|
||||
MOVV (R5), R11
|
||||
MOVV 8(R5), R12
|
||||
MOVV 16(R5), R13
|
||||
MOVV 24(R5), R14
|
||||
MOVV -32(R8), R15
|
||||
MOVV -24(R8), R16
|
||||
MOVV -16(R8), R17
|
||||
MOVV -8(R8), R18
|
||||
MOVV R11, (R4)
|
||||
MOVV R12, 8(R4)
|
||||
MOVV R13, 16(R4)
|
||||
MOVV R14, 24(R4)
|
||||
MOVV R15, -32(R7)
|
||||
MOVV R16, -24(R7)
|
||||
MOVV R17, -16(R7)
|
||||
MOVV R18, -8(R7)
|
||||
RET
|
||||
|
Loading…
Reference in New Issue
Block a user